bs4.BeautifulSoup
#warningmessage-somethimesworkingwithpythonpacakge
#whichareoutofdateorneedtobeupdatedortobeinlatestversion
#soifweareusingthepythonpacakgenotconsistingoftheabovethingsthen
#warningmessagewillappear
#warningmessagewillnothaveanyaffectontheexecution
#sotorectifythewarningmessageweareimportingthewarningslibrary
#herewehaveimportedthewarningslibrary
#thenfromwarningslibrarywehaveimportedthemodulecalledfilterwarnings("ignore")
importwarnings
warnings.filterwarnings("ignore")
#importingtherequiredlibraries
#beautifulsoupisthelibrarythroughwhichwecandothetaskofwebscraping
#mostsignigicantandmostoptimallibraryisthebeautifulsoup
#importednumpyandthepandas
#bs4standsforbeautifulsoupversion4
#frombs4libraryweareimportingthemethodcalledbeautifulsoup
#thenweareimportingrequest-requestwillgeneratetherequestsfromthewebsites
#interentorthenetwork
#everymessageisgeneratedingetandpostmethod
#importingcsv-oncewegotthedatafromthewebsite
#sotoconvertthedataintothereadableformat&weareconvertingthedataintothecsv
#importingre-restandsforregularexpressions-theexpressionshouldbesymantically
#syntaxcallycorrectsotomanageallthiscorrectnessreisused
#importurllib.requestasurllib2-hereweareimportingurllib.requestlibrary
#andfromhereweareimportingthemodulecalledurllib2
#everywebsiteworksonURL,URLmeansuniformresourcelocator
#usingtheURLwecandirectlylocatethewebpage
#fromdatetimeimportdatetime-hereweareimportingdatetimeanthemodule
#weareimportingcalleddatetime.
#datetimeisusedtohandeorsetthedateandtime
#os-operatingsystem-werequiredsomemethodsweneedthelibrarycalledos
#importingsys-sysstandsforsystemrequirement,itistocompletethesyetemdependency
##hereweareimportingthelibrarycalledmatplotlib.pyplot
#asplttocreatethechartsandvisualizethem
#hereweareimportingthelibrarycalledmatplotlib.imageasmpimgtoworkwiththeimages
importnumpyasnp
importpandasaspd
frombs4importBeautifulSoup
importrequests
importcsv
importre
importurllib.requestasurllib2
fromdatetimeimportdatetime
importos
importsys
importmatplotlib.pyplotasplt
importmatplotlib.imageasmpimg
#herefirstwehavepassedtheurl'https://en.wikipedia.org/wiki/Healthcare_in_Europe'
#wearetakingtheurlofhealthcare
#thenwearecreatingthevariablecalledrandwearerequestingtogettheURL
#abdthenwehavecreatedthenewvariablecalledandconvertingthedata
#whichwegotfromtheurltothebeautifulsoup
#andsimplycheckingthetypeofthevariablewehavecreatedthatisHCE
#randHCEarethevariableswehavecreated
#oncewehavegottheoutputthatmeansthedataisintheformatofbeautifulsoup
url='https://en.wikipedia.org/wiki/Healthcare_in_Europe'
r=requests.get(url)
HCE=BeautifulSoup(r.text)
type(HCE)
#firstweareusingthewebpagemethodandopeningtheurl
#byusingtheurlopenmethodfromtheurllib2thatweimported
#andtheopentheurltoreadthedata
#thenwehavecreatedthevariablecalledthehtmlpageandfromthewebpagemethod
#weareusingthefunctioncalledreadlinesthatmeanswearereadingthetextfromthewebpages
#thenwearegoingtostoreallthetextinthelistcalledlst
#usingtheforloopinthehtmlpagewearereadingtheentireline
#thisforloopworksfortheentirepage
#thenwehavecreatedthevariablecalledlineinthehtmlpage
#whateverlinewereadthatlinewillbeconvertedtothestringformat
#andalsothatlineswillbestripedthatmeansweareremovingtheunwantedspaces.
#thenwearepassingoneconditionthatisiftheregularexpression(re)ishavingtheline
#intheformatoftableclassexistinthetextthenwewanttoappendthatline
#lst=[]-blankbracketisusedbecausewhateverlineswillbeappendwillbestoredinthelst
webpage=urllib2.urlopen(url)
htmlpage=webpage.readlines()
lst=[]
forlineinhtmlpage:
line=str(line).rstrip()
5
['b\'<tableclass="wikitablefloatrightsortable"style="font-size:90%">\\n\'',
'b\'<divclass="navbox-styles"><styledata-mw-deduplicate="TemplateStyles:r1129693374">.mw-parser-output.hlis
tdl,.mw-parser-output.hlistol,.mw-parser-output.hlistul{margin:0;padding:0}.mw-parser-output.hlistdd,.mw
-parser-output.hlistdt,.mw-parser-output.hlistli{margin:0;display:inline}.mw-parser-output.hlist.inline,.m
w-parser-output.hlist.inlinedl,.mw-parser-output.hlist.inlineol,.mw-parser-output.hlist.inlineul,.mw-pars
er-output.hlistdldl,.mw-parser-output.hlistdlol,.mw-parser-output.hlistdlul,.mw-parser-output.hlisto
ldl,.mw-parser-output.hlistolol,.mw-parser-output.hlistolul,.mw-parser-output.hlistuldl,.mw-parser-ou
tput.hlistulol,.mw-parser-output.hlistulul{display:inline}.mw-parser-output.hlist.mw-empty-li{display:n
one}.mw-parser-output.hlistdt::after{content:":"}.mw-parser-output.hlistdd::after,.mw-parser-output.hlist
li::after{content:"\\xc2\\xb7";font-weight:bold}.mw-parser-output.hlistdd:last-child::after,.mw-parser-outp
ut.hlistdt:last-child::after,.mw-parser-output.hlistli:last-child::after{content:none}.mw-parser-output.hl
istdddd:first-child::before,.mw-parser-output.hlistdddt:first-child::before,.mw-parser-output.hlistddli
:first-child::before,.mw-parser-output.hlistdtdd:first-child::before,.mw-parser-output.hlistdtdt:first-ch
ild::before,.mw-parser-output.hlistdtli:first-child::before,.mw-parser-output.hlistlidd:first-child::befo
re,.mw-parser-output.hlistlidt:first-child::before,.mw-parser-output.hlistlili:first-child::before{conten
t:"(";font-weight:normal}.mw-parser-output.hlistdddd:last-child::after,.mw-parser-output.hlistdddt:last-
child::after,.mw-parser-output.hlistddli:last-child::after,.mw-parser-output.hlistdtdd:last-child::after,
.mw-parser-output.hlistdtdt:last-child::after,.mw-parser-output.hlistdtli:last-child::after,.mw-parser-ou
tput.hlistlidd:last-child::after,.mw-parser-output.hlistlidt:last-child::after,.mw-parser-output.hlistl
ili:last-child::after{content:")";font-weight:normal}.mw-parser-output.hlistol{counter-reset:listitem}.mw-pa
rser-output.hlistol>li{counter-increment:listitem}.mw-parser-output.hlistol>li::before{content:""counter(l
istitem)"\\\\a0"}.mw-parser-output.hlistddol>li:first-child::before,.mw-parser-output.hlistdtol>li:first
-child::before,.mw-parser-output.hlistliol>li:first-child::before{content:"("counter(listitem)"\\\\a0"}</s
tyle><styledata-mw-deduplicate="TemplateStyles:r1061467846">.mw-parser-output.navbox{box-sizing:border-box;bo
rder:1pxsolid#a2a9b1;width:100%;clear:both;font-size:88%;text-align:center;padding:1px;margin:1emauto0}.mw-
parser-output.navbox.navbox{margin-top:0}.mw-parser-output.navbox+.navbox,.mw-parser-output.navbox+.navbox-
styles+.navbox{margin-top:-1px}.mw-parser-output.navbox-inner,.mw-parser-output.navbox-subgroup{width:100%}.m
w-parser-output.navbox-group,.mw-parser-output.navbox-title,.mw-parser-output.navbox-abovebelow{padding:0.25
em1em;line-height:1.5em;text-align:center}.mw-parser-output.navbox-group{white-space:nowrap;text-align:right}
.mw-parser-output.navbox,.mw-parser-output.navbox-subgroup{background-color:#fdfdfd}.mw-parser-output.navbox
-list{line-height:1.5em;border-color:#fdfdfd}.mw-parser-output.navbox-list-with-group{text-align:left;border-l
eft-width:2px;border-left-style:solid}.mw-parser-outputtr+tr>.navbox-abovebelow,.mw-parser-outputtr+tr>.navbo
x-group,.mw-parser-outputtr+tr>.navbox-image,.mw-parser-outputtr+tr>.navbox-list{border-top:2pxsolid#fdfdfd
}.mw-parser-output.navbox-title{background-color:#ccf}.mw-parser-output.navbox-abovebelow,.mw-parser-output.
navbox-group,.mw-parser-output.navbox-subgroup.navbox-title{background-color:#ddf}.mw-parser-output.navbox-s
ubgroup.navbox-group,.mw-parser-output.navbox-subgroup.navbox-abovebelow{background-color:#e6e6ff}.mw-parser
-output.navbox-even{background-color:#f7f7f7}.mw-parser-output.navbox-odd{background-color:transparent}.mw-pa
rser-output.navbox.hlisttddl,.mw-parser-output.navbox.hlisttdol,.mw-parser-output.navbox.hlisttdul,
.mw-parser-output.navboxtd.hlistdl,.mw-parser-output.navboxtd.hlistol,.mw-parser-output.navboxtd.hlist
ul{padding:0.125em0}.mw-parser-output.navbox.navbar{display:block;font-size:100%}.mw-parser-output.navbox-t
itle.navbar{float:left;text-align:left;margin-right:0.5em}</style></div><divrole="navigation"class="navbox"
aria-labelledby="Healthcare_in_Europe"style="padding:3px"><tableclass="nowraplinksmw-collapsibleautocollaps
enavbox-inner"style="border-spacing:0;background:transparent;color:inherit"><tbody><tr><thscope="col"class=
"navbox-title"colspan="2"><linkrel="mw-deduplicated-inline-style"href="mw-data:TemplateStyles:r1129693374"/>
<styledata-mw-deduplicate="TemplateStyles:r1063604349">.mw-parser-output.navbar{display:inline;font-size:88%;
font-weight:normal}.mw-parser-output.navbar-collapse{float:left;text-align:left}.mw-parser-output.navbar-boxt
ext{word-spacing:0}.mw-parser-output.navbarul{display:inline-block;white-space:nowrap;line-height:inherit}.mw
-parser-output.navbar-brackets::before{margin-right:-0.125em;content:"["}.mw-parser-output.navbar-brackets::
after{margin-left:-0.125em;content:"]"}.mw-parser-output.navbarli{word-spacing:-0.125em}.mw-parser-output.n
avbara>span,.mw-parser-output.navbara>abbr{text-decoration:inherit}.mw-parser-output.navbar-miniabbr{font-
variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output.navbar-ct-full{fon
t-size:114%;margin:07em}.mw-parser-output.navbar-ct-mini{font-size:114%;margin:04em}</style><divclass="navb
arplainlinkshlistnavbar-mini"><ul><liclass="nv-view"><ahref="/wiki/Template:Healthcare_in_Europe"title="T
emplate:HealthcareinEurope"><abbrtitle="Viewthistemplate"style=";;background:nonetransparent;border:none
;box-shadow:none;padding:0;">v</abbr></a></li><liclass="nv-talk"><ahref="/wiki/Template_talk:Healthcare_in_Eu
rope"title="Templatetalk:HealthcareinEurope"><abbrtitle="Discussthistemplate"style=";;background:nonet
ransparent;border:none;box-shadow:none;padding:0;">t</abbr></a></li><liclass="nv-edit"><aclass="externaltext
"href="https://en.wikipedia.org/w/index.php?title=Template:Healthcare_in_Europe&amp;action=edit"><abbrtitle="
Editthistemplate"style=";;background:nonetransparent;border:none;box-shadow:none;padding:0;">e</abbr></a></
li></ul></div><divid="Healthcare_in_Europe"style="font-size:114%;margin:04em"><aclass="mw-selflinkselflink
">HealthcareinEurope</a></div></th></tr><tr><thscope="row"class="navbox-group"style="width:1%">Sovereign
states</th><tdclass="navbox-list-with-groupnavbox-listnavbox-oddhlist"style="width:100%;padding:0"><divst
yle="padding:00.25em">\\n\'',
'b\'<divclass="navbox-styles"><linkrel="mw-deduplicated-inline-style"href="mw-data:TemplateStyles:r11296933
74"/><linkrel="mw-deduplicated-inline-style"href="mw-data:TemplateStyles:r1061467846"/></div><divrole="navig
ation"class="navbox"aria-labelledby="Health_in_Europe"style="padding:3px"><tableclass="nowraplinksmw-colla
psibleautocollapsenavbox-inner"style="border-spacing:0;background:transparent;color:inherit"><tbody><tr><th
scope="col"class="navbox-title"colspan="2"><linkrel="mw-deduplicated-inline-style"href="mw-data:TemplateSty
les:r1129693374"/><linkrel="mw-deduplicated-inline-style"href="mw-data:TemplateStyles:r1063604349"/><divclas
s="navbarplainlinkshlistnavbar-mini"><ul><liclass="nv-view"><ahref="/wiki/Template:Europe_topic"title="Te
mplate:Europetopic"><abbrtitle="Viewthistemplate"style=";;background:nonetransparent;border:none;box-shad
ow:none;padding:0;">v</abbr></a></li><liclass="nv-talk"><ahref="/wiki/Template_talk:Europe_topic"title="Temp
latetalk:Europetopic"><abbrtitle="Discussthistemplate"style=";;background:nonetransparent;border:none;bo
x-shadow:none;padding:0;">t</abbr></a></li><liclass="nv-edit"><aclass="externaltext"href="https://en.wikipe
dia.org/w/index.php?title=Template:Europe_topic&amp;action=edit"><abbrtitle="Editthistemplate"style=";;back
ground:nonetransparent;border:none;box-shadow:none;padding:0;">e</abbr></a></li></ul></div><divid="Health_in_
Europe"style="font-size:114%;margin:04em"><ahref="/wiki/Health_in_Europe"class="mw-redirect"title="Health
ifre.search('tableclass',line):
lst.append(line)
#checkingthelengthofthelist
len(lst)
#checkedforthelst
lst
# Locate the life-expectancy summary table by its CSS class.
# BUG FIX: the attrs argument must be a dict ({'class': value}), not a
# set ({'class', value}); the class string is the space-separated value
# seen in the page source.
table = HCE.find('table', {'class': 'wikitable floatright sortable'})
# A found table is a bs4.element.Tag (None would mean no match).
type(table)
#'"([^"]*)"'-thisiscalledthegrammeroftext
#'"([^"]*)"'-thisaretheexpressionusedintheregularexpressionstodefinthegrammer
#inNLPthegrammeroftextisexplained
#here1stwehavecreatedthevariablecalledxandweareusingthepreviouslydefinedlist
#extr-thisisusedforextract
#thenwewanttoextractalltheregularexpressionsrebyusingthe
#findallmethodinthexorthelistwecreated
#thencreatedthetablenamedHCEandfromtherewearefindingthetableclass
#andputitintheextractlist
#soherewehaveextractedtheegularexpresions
#byusingthefindallmethodandputitintotheextractedlist.
#'"([^"]*)"'
#re-intheregularexpressionwearegoingtofindtheentirethings
#[]-hereitisgivingusthelist,squarebracketanditisdefiningthetableclaas
##*-starmenasentirewhichcomesunderthelist,squarebracketortableclaas
#wearesayingwhateverisinsidethetablewewanttoextractthatpart
x=lst[0]
extr=re.findall('"([^"]*)"',x)
table=HCE.find('table',{'class',extr[0]})
#checkingthetypeofthetable
type(table)
#readingtheheadersandtherowsofthenamesseperately
#wehavecreatedthevariablecalledheaders
#thenwearewritingtheforloopforheader.text,header
#.textandforloopweareexecutingforheader
#basicallyheader.textistheinputfortheforloop
['WorldRank\n','EURank\n','Country\n','Lifeexpectancyatbirth(years)\n']
WorldRank\n EURank\n Country\n Lifeexpectancyatbirth(years)\n
0 None None None None
1 5.\n 1.\n Spain\n 83.4\n
2 6.\n 2.\n Italy\n 83.4\n
3 11.\n 3.\n Sweden\n 82.7\n
4 12.\n 4.\n France\n 82.5\n
5 13.\n 5.\n Malta\n 82.4\n
6 16.\n 6.\n Ireland\n 82.1\n
#thenwewillcomparethisheaderwiththecomparisionoperatorin
#inisthecomparisionoperator
#anditwillalltheth.sowhereeverwearegettingthwewillfindthat
#wearecreatingoneheaderfortheforloopinthetable
#andbyusingthefindallmethod[findallisthestringmethod]and
#[thisthepartofthehtml,wecalleditasthead]
#thatweareextractingthatcolumnsusingthethortheadmethod
#basicallytheadthatisthstorestheIDs,columnsnamesinthehtmlpage
#findallfunctionistosearchforthestring
headers=[header.textforheaderintable.find_all('th')]
#thesearetheheaderswegot
#checkingfortheheaders
#aswecansee18thcellthfindstheheader
headers
#creatingthevariablecalledrows
#thesquarebracketisemptywiththerowsaswehavenotpassedanythingsintherows
#thenweareexcutingoneforloopwhichworksfortheentirerowinthetable
#whichwearegoingtofind
#andbyusingthefind_all('tr'),itdefinestherowinthetable
#tristhepartofthehtmltag
#andappendtherowwherewegottheencode-
#justlikewehaveASCIIcodewehavetextencodeandmostlyweusetheencodingnamedutf8
#whateverdatawegotfromthetablewewanttodecodeit
#oncewedecodeitwewillcreateoneforloopforeachrowandwewillfindthetd
#tdisthehtmltabledata,oritisthestandardwaytodefinethedatacell
#valisthevariable.itcanbeanythinglikex,y,valanything
#bestwaytosolvethiskindofproblmsisthelistcomprehension.
#utf8istheencoding.differentencodingarethereforthedifferentwebpages
#ASCIIcode-itisdefinedasthechracterencodingformatforthetextdataincomputer
#andontheinternet
#0,1-thisisthemachinelanguage
rows=[]
forrowintable.find_all('tr'):
rows.append([val.text.encode('utf8').decode()forvalinrow.find_all('td')])
#sotherowsandcolumnswehaveextractedwewillcreatethedataframe
#dataframewillbenamedasdf1
df1=pd.DataFrame(rows,columns=headers)
#checkingtheheadofthedf1
#wearecheckingthe7rowsofthedf1
#WorldRank\n EURank\n Country\n Lifeexpectancyatbirth(years)
#\n-thisaretheheadersofthedata
df1.head(7)
#healthexpenditure
#herefirstwehavepassedtheurl'https://en.wikipedia.org/wiki/Healthcare_in_Europe'
#wearetakingtheurlofhealthcare
#thenwearecreatingthevariablecalledrandwearerequestingtogettheURL
#abdthenwehavecreatedthenewvariablecalledHEEandconvertingthedatawhich
#wegotfromtheurltothebeautifulsoup
url='https://en.wikipedia.org/wiki/List_of_countries_by_total_health_expenditure_per_capita'
r=requests.get(url)
HEE=BeautifulSoup(r.text)
#thenweareusingthewebpagemethodandopeningtheurlbyusingtheurlopenmethod
#fromtheurllib2thatweimportedandtheopentheurltoreadthedata
#thenwehavecreatedthevariablecalledthehtmlpageandfromthewebpagemethod
#weareusingthefunctioncalledreadlinesthatmeanswearereadingthetextfromthewebpages
#thenwearegoingtostoreallthetextinthelistcalledlst
#usingtheforloopinthehtmlpagewearereadingtheentireline
#thisforloopworksfortheentirepage
#thenwehavecreatedthevariablecalledlineinthehtmlpage
#whateverlinewereadthatlinewillbeconvertedtothestringformat
#andalsothatlineswillbestripedthatmeansweareremovingtheunwantedspaces.
#thenwearepassingoneconditionthatisiftheregularexpression(re)ishavingtheline
#intheformatoftableclassexistinthetextthenwewanttoappendthatline
#lst=[]-blankbracketisusedbecausewhateverlineswillbeappendwillbestoredinthelst
webpage=urllib2.urlopen(url)
htmlpage=webpage.readlines()
lst=[]
forlineinhtmlpage:
line=str(line).rstrip()
ifre.search('tableclass',line):
lst.append(line)

# The second matched line (index 1) holds the expenditure table's tag;
# '"([^"]*)"' captures its quoted attribute values, and the first one
# (the class string) is used to locate the table.
x = lst[1]
print(x)
extr = re.findall('"([^"]*)"', x)
# BUG FIX: attrs must be a dict ({'class': value}), not a set.
table = HEE.find('table', {'class': extr[0]})
#readingtheheadersandtherowsofthenamesseperately
#wehavecreatedthevariablecalledheaders
#thenwearewritingtheforloopforheader.text,header
#.textandforloopweareexecutingforheader
#basicallyheader.textistheinputfortheforloop
#thenwewillcomparethisheaderwiththecomparisionoperatorin
#inisthecomparisionoperator
#anditwillalltheth.sowhereeverwearegettingthwewillfindthat
#wearecreatingoneheaderfortheforloopinthetableand
#byusingthefindallmethod[findallisthestringmethod]and
#[thisthepartofthehtml,wecalleditasthead]thatweareextractingthatcolumns
#usingthethortheadmethod
#basicallytheadthatisthstorestheIDs,columnsnamesinthehtmlpage
#findallfunctionistosearchforthestring
headers=[header.textforheaderintable.find_all('th')]
#creatingthevariablecalledrows
#thesquarebracketisemptywiththerowsaswehavenotpassedanythingsintherows
#thenweareexcutingoneforloopwhichworksfortheentirerowinthetablewhich
#wearegoingtofind
#andbyusingthefind_all('tr'),itdefinestherowinthetable
#tristhepartofthehtmltag
#andappendtherowwherewegottheencode-
#justlikewehaveASCIIcodewehavetextencodeandmostlyweusetheencodingnamedutf8
#whateverdatawegotfromthetablewewanttodecodeit
#oncewedecodeitwewillcreateoneforloopforeachrowandwewillfindthetd
#tdisthehtmltabledata,oritisthestandardwaytodefinethedatacell
#utf8istheencoding.differentencodingarethereforthedifferentwebpages
#ASCIIcode-itisdefinedasthechracterencodingformatforthetextdataincomputer
#andontheinternet
#0,1-thisisthemachinelanguage
#valisthevariable.itcanbeanythinglikex,y,valanything
#bestwaytosolvethiskindofproblmsisthelistcomprehension.
rows=[]
forrowintable.find_all('tr'):
b'<tableclass="wikitablesortablestatic-row-numbersplainrowheaderssrn-white-background"border="1"style="t
ext-align:right;">\n'
Location 2018 2019 2020 2021
0 None None None None None
1 Australia*\n 5,194\n 5,130\n 5,627\n \n
2 Austria*\n 5,519\n 5,624\n 5,883\n 6,693\n
3 Belgium*\n 5,315\n 5,353\n 5,407\n \n
4 Canada*\n 5,308\n 5,190\n 5,828\n 5,905\n
rows.append([val.text.encode('utf8').decode()forvalinrow.find_all('td')])

#hereweareworkingontheperiviouslydefinedvariablecalledheader
#replacetheiwiththenewlineandexecutetheforloopforiintheheaders
#sobyusingthepandaslibrarycreatingthenewvariablecalleddf2
#sotherowsandcolumnswehaveextractedwewillcreatethedataframe
#dataframewillbenamedasdf2
headers=[i.replace("\n","")foriinheaders]
df2=pd.DataFrame(rows,columns=headers)
#checkingtheheadofthedf2
df2.head()
#additionalpreproccessingsteps
#preprocessingcanbedoneonboththenumericalandthetextualdata
#herewehavedefinedthefunctioncalledpreprocandwearepassingtheparametercalled
#datathatisdatainwhichwewillbeextractingtheinformationfromweb
#herewearedroopingthenavaluesandwearelookingfor
#axis=0thatmeanswearelookingfortheentirerow
#how='all'-thenthemethodwechosenthathowwearelookingfortherows
#sowearelookingfortheentirerows
#how='all'-itisthemethodpresentinthepythonlibraries.
#usingthismethodwedetermineiftherowsandthecolumnsareremovedfromthedataset
#dropnafunctionisthemethodtocreatethenullvalues
#inplace=true-itwillnotoverwritethevaluesagainandagainafterexecutingthesamecode
#nextwearelookingforthecolumnindatandinthecolumnswearelookingfothestring
#andwearereplacingthe\nthatisnewline
#againwearereplacingthenewlinebyusingthemethodcalledregex
#regexstandsforregularexpressions
#r"\s\*$"-thisiscalledasthegrammer,herewehaverelacedthegrammer
#r-thisiscalledasstringliteral.
#sstandsforstring
#regex=True-thisregularexpressionisusedtodefinetheentirethingswhicharepresent
#inthetext.
#.-thismeansanycharaceter
#*-thismeansanynumberofthis.thisisanythinglikeifwewrite*.
#likeanynumberofcharacter
#.*-thismeansarbitarystringofarbitarylength.
#^sign-itindicatesthebeginningofthestring
#dollar$sign-itindicatestheendofthestring
#\s-itindicatesthewhitespacecharacter.whitespacemeanstheblankspaceinthetext
#\-backslashisusedfordirectoryseperatormeanswheneverifwewanttoswitchfrom
#onedrivetotheanotherdrive
#/-forwardslash-thisslashisusedtoshowthelocation/pathofthefile
#r"\b[a-zA-Z]\b"-herebdefinestheboundariesthatwewanttoreplaceitbynumpynan
#r"^\s"-^beginingofthestring,smeansthewhitespace
#ifthestringstartswiththewhitespaceandweareidentifyingthisstringortherow
#thenreplaceitbythenewline
#identationisthespaceatthestartofthestatement
#thenweareusingtheapplymethodofthepandaslikeifanynumericvaluepresent
#thenwewanttoconvertthemintothetextorifthereistheerrorgeneration
#thenwewanttoignorethem
#andatendwewillusethereturnstatementtogettheoutputofthedatathatwewillpass
#anytypeoferrorscanbegenerate
defpreproc(dat):
dat.dropna(axis=0,how='all',inplace=True)
dat.columns=dat.columns.str.replace("\n","")
dat.replace(["\n"],[""],regex=True,inplace=True)
dat.replace([r"\s\*$"],[""],regex=True,inplace=True)
dat.replace([","],[""],regex=True,inplace=True)
dat.replace(r"\b[a-zA-Z]\b",np.nan,regex=True,inplace=True)
dat.replace([r"^\s"],[""],regex=True,inplace=True)
dat=dat.apply(pd.to_numeric,errors='ignore')
return(dat)
#herewehavecreatedthevariablecalleddf1andthedf2
#andwearepassingthevariablecalleddf1anddf2inthepreprocessingfunction
df1=preproc(df1)
df2=preproc(df2)
0
18
WorldRank EURank Country Lifeexpectancyatbirth(years)
WorldRankfloat64
EURankfloat64
Countryobject
Lifeexpectancyatbirth(years)float64
dtype:object
Locationobject
2018int64
2019int64
2020int64
2021float64
dtype:object
WorldRank EURank Country Lifeexpectancyatbirth(years)
1 5.0 1.0 Spain 83.4
2 6.0 2.0 Italy 83.4
3 11.0 3.0 Sweden 82.7
4 12.0 4.0 France 82.5
5 13.0 5.0 Malta 82.4
Country 2017 2018 2019 2020
1 Australia 5194 5130 5627 NaN
2 Austria 5519 5624 5883 6693.0
3 Belgium 5315 5353 5407 NaN
4 Canada 5308 5190 5828 5905.0
5 Chile 2281 2297 2413 2608.0
#hereweareprintingthesumofallthenullvaluesinthedf1andthedf2
#summethodcalculatestheelementsofeachrow
#bydefaultthesumcalculatesthenullvaluespresentinthecolumns
#butwearealsocalculatingthenullvaluespresentintherows
#sowehaveusedanothersum()also
#1stsum()isusedtocalculatethenullvaluesinthecolumnsand
#the2ndsumwillcalculatethenullvaluesinrows.thatswhywehaveusedtwosum()
print(df1.isnull().sum().sum())
print(df2.isnull().sum().sum())
#theoutputshowsthatthereisnonullbaluespresentinthedf1andthedf2
#herefromthedf1wearecheckingthenullvaluespresent
#inthedf1ifanyexistinthecolumnthatisaxis=1
#wealwaysreadthedataintheformrandcthatisintheformofrowandcolumn
df1[df1.isnull().any(axis=1)]
#theoutputshowsthereisnonullvalueavailable
#inthepandasthedropmethodtake2valueseitheritwilltakeallorany
#any-itrepresentsifanynavaluesmaybe1or2ormorepresentthendropthatroworcolumn
#ifallvaluesarenathatishavingthenullvaluesdropthatroworcolumn
#howistheparameterinthepanfdaslibraryunderthedropnafunctions
#hereinthedf1wearedroppingtheanynavaluespresentintherows
df1.dropna(axis=0,how='any',inplace=True)
#checkingthedatatypesofdf1
df1.dtypes
#checkingthedatatypesofdf2
df2.dtypes
#hereweareprintingthecolumnsnamesfordf1anddf2
df1.columns=['WorldRank','EURank','Country','Lifeexpectancyatbirth(years)']
df2.columns=['Country','2017','2018','2019','2020']
#herewearecheckingtheintialsrowsofthedf1
df1.head()
#herewearecheckingtheinitialrowsofthedf2
df2.head()
# here we are using Country as the primary key because the merge needs a common column
WorldRank EURank Country Lifeexpectancyatbirth(years) 2017 2018 2019 2020
0 5.0 1.0 Spain 83.4 3427.0 3523.0 3718.0 NaN
1 6.0 2.0 Italy 83.4 3496.0 3565.0 3747.0 4038.0
2 11.0 3.0 Sweden 82.7 5419.0 5388.0 5757.0 6262.0
3 12.0 4.0 France 82.5 5099.0 5168.0 5468.0 6115.0
4 13.0 5.0 Malta 82.4 NaN NaN NaN NaN
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa
<class'pandas.core.frame.DataFrame'>
RangeIndex:150entries,0to149
Datacolumns(total6columns):
#ColumnNon-NullCountDtype
----------------------------
0Id150non-nullint64
1SepalLengthCm150non-nullfloat64
2SepalWidthCm150non-nullfloat64
3PetalLengthCm150non-nullfloat64
4PetalWidthCm150non-nullfloat64
5Species150non-nullobject
dtypes:float64(4),int64(1),object(1)
memoryusage:7.2+KB
#hereweareusingthecountryastheprimarykeybecause
#weneedthecolumncolumnwheneverweareworkingwiththemergedata
#wearetryingtomergethedatasetscalleddf1anddf2and
#byusingthecommoncolumncalledcountrywewanttoperforntheleftjoin
#howislikaajointhatwehaveusedtheparameterlikeleftjoin
#thedefaulthowisinnerandinneristhejoin
#thetypesofjoinsare;inner,left,outerandfull
pd.merge(df1,df2,how='left',on='Country').head()
#featureengineering
#themajorlibraryisthepreprocessing-itisused
#whenwewanttodesignthetrainandtestdatasethereweusethepreprocessing
#encodingmethodslikelabelencodingandtheonehotencoding
#thensklearnlibrarylikeifwewanttoimporthelogisticregressionetc...#
#Selectthecellandclickonrunicontoimportlibraries.
#importingtherequiredlibrariesthatispandas,numpyandthesklearnlibrray
#thelabelencodigandtheonehotencodingmethodsareavailable
#intheskalearnpreprocessinglibrarythatswhyweareimportingthepreprocessingfromsklearn.
#allthemachinelearningalgorithmsarepresentinthesklearnlibrary
#sklearnisthepriimarylibrarybutthemethodsaredifferent
importpandasaspd
importnumpyasnp
#Importlabelencoder
fromsklearnimportpreprocessing
#herewehavecreatedthevariablecalledirisdf
#thenbyusingthepandaslibraryweareusingthecsvfile
#thenwearecheckingtheinitialheadersoftheirisdataset
iris_df=pd.read_csv('Iris.csv')
iris_df.head()
#checkingtheinformationpresentintheirisdataset
#herethedatatypeforspeciesisconvertedtotheint64
#thatmeansourlabelencoderhadconvertedthedataofspeciesfromcategoricalvalue
#tothenumericalvalue.
iris_df.info()
#weareusingthelabelencodertoconvertthecategoricalvaluetothenumericalvalue
#herewehavemakethemethodcalledlabelencoder
#andweareimportingthismodulecalledlabelencoderfromthepreprocessinglibrary
#ofthesklearn
label_encoder=preprocessing.LabelEncoder()
array(['Iris-setosa','Iris-versicolor','Iris-virginica'],dtype=object)
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 0
1 2 4.9 3.0 1.4 0.2 0
2 3 4.7 3.2 1.3 0.2 0
3 4 4.6 3.1 1.5 0.2 0
4 5 5.0 3.6 1.4 0.2 0
<class'pandas.core.frame.DataFrame'>
RangeIndex:150entries,0to149
Datacolumns(total6columns):
#ColumnNon-NullCountDtype
----------------------------
0Id150non-nullint64
1SepalLengthCm150non-nullfloat64
2SepalWidthCm150non-nullfloat64
3PetalLengthCm150non-nullfloat64
4PetalWidthCm150non-nullfloat64
5Species150non-nullint32
dtypes:float64(4),int32(1),int64(1)
memoryusage:6.6KB
#herewearefindigtheuniquevaluesofthespecieselementpresentintheirisdataset
iris_df['Species'].unique()
#hereweareconvertingthedataofspeciesfromcategoricalvaluetothenumericalvalues
#fittransform-itisthemethodandbyusingthismethodwearetransforming
#thespeciescolumnoftheirisdataset
#hereiris_df['Species']isthevariablewehavecreatedandbyusing
#thelabelencoderwearetransformingthespeciescolumnoftheirisdataset
#thenwearecheckingtheheadoftheirisdataset
iris_df['Species']=label_encoder.fit_transform(iris_df['Species'])
#herethemeanandthevarianceiscalculatedfor
#theSepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
#hereweareobservingthechangesonthespeciescolumn
#aswehaveperformedthetransformationonthespeciescolumnonly
#andrestofthecolumnsdataaresame
#herewearehavingonly0inthespeciescolumn
#butifwewrite10or15intheheadtoviewthedatacolumnsthenwecansee0and1sboth
iris_df.head()
#checkingtheinformationoftheirisdataset
iris_df.info()
#hereweareimportingthemodulecalleddatasetsfromthesklearnlibrary
#andhereweareimportingthemodulecalledOneHotEncoderfromthepreprocessinglibrary
#underthesklearn
fromsklearnimportdatasets
fromsklearn.preprocessingimportOneHotEncoder
#herewearecreatingthecolumn
#herewehavecreatedthevariablecalledirisdataandwehaveloadedtheirisinthedatasets
#thenbyusingthepandaswehavecreatedthedataframe
#data=np.c-thispartisusedforslicing-ittransfersthesliceobjecttoconcatination
#aswehavedividedthedatasetintotwopartsthatisthedataandthetarget
#thenwehavecreatedthedataandthenwearecreatingthecloumnsinthedataby
#usingthenumpylibraryandthecoulmnsarecreatedintheirisdatasetthatisfeatures
#nameadtarget.
#thenwearecheckingthetargetvaluesasyisthetargetintheirisdataset
iris_data=dataoad_irissets.l()
iris_data=pd.DataFrame(data=np.c_[iris_data["data"],iris_data["target"]],
columns=iris_data["feature_names"]+["target"])
y=iris_data.target.values
#checkingtheinitialrowsoftheirisdataset
iris_data.head()
sepallength(cm) sepalwidth(cm) petallength(cm) petalwidth(cm) target
0 5.1 3.5 1.4 0.2 0.0
1 4.9 3.0 1.4 0.2 0.0
2 4.7 3.2 1.3 0.2 0.0
3 4.6 3.1 1.5 0.2 0.0
4 5.0 3.6 1.4 0.2 0.0
array([0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,
0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,
0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,1.,
1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,
1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,
1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,2.,2.,
2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,
2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,
2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.,2.])
[[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
[1.0.0.]
#checkingthetargetvaluesasyisthetarget
y
#firstweneedtoimporttheonehotencoderandmakeamethodforit
#herewehavecreatedthevariablecalledonehotencoder
#thenintheonehotencoderwearespecifyingthecategoriescalled
#autothatmeanscategoriesaregoingtobedeterminedautomatically
#thenwearereshapingtheyarraythatisthetargetarraythatistobetransformed
#andtobefitinthey
#reshapeandtoarrayaretwomethodsofthenumpy
#herewearereshapingthearraytothefor,atof-1to+1
#andthenweareprintingtheythatistargetintheformofarray
#firstwewillreshapethearrayandthenwewillprintthearray
#-1isfortheunknowndimenssion,reshapefunctioncalculatestheproperdimenssion
#ifanyunknown/improperdimenssionisthere
onehotencoder=OneHotEncoder(categories='auto')
y=onehotencoder.fit_transform(y.reshape(-1,1))
print(y.toarray())
[1.0.0.]
[1.0.0.]
[1.0.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.1.0.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]
[0.0.1.]]
0.0 1.0 2.0
0 1 0 0
1 1 0 0
2 1 0 0
3 1 0 0
4 1 0 0
Rank Name Platform Year Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales
0 1 WiiSports Wii 2006.0 Sports Nintendo 41.49 29.02 3.77 8.46 82.74
1 2 SuperMarioBros. NES 1985.0 Platform Nintendo 29.08 3.58 6.81 0.77 40.24
2 3 MarioKartWii Wii 2008.0 Racing Nintendo 15.85 12.88 3.79 3.31 35.82
3 4 WiiSportsResort Wii 2009.0 Sports Nintendo 15.75 11.01 3.28 2.96 33.00
4 5 PokemonRed/PokemonBlue GB 1996.0 Role-Playing Nintendo 11.27 8.89 10.22 1.00 31.37
Index(['Rank','Name','Platform','Year','Genre','Publisher','NA_Sales',
'EU_Sales','JP_Sales','Other_Sales','Global_Sales'],
dtype='object')
(16598,11)
#thetargetvariablesarecovertedintothenumericalvalues
#thenwewillcreatethedummies
#get.dummiesisthefunctionnamepresentinthepandaslibrary
#herebyusingthepandaslibrarywearecreatingthedummiesforthetargetvariable
#intheirisdatasetandwearecheckingtheinitialsrowsandcolumnsofit.
pd.get_dummies(iris_data.target).head()
#fromthesklearn.feature_extractionmethodunderthesklearnlibraryweareimportingFeatureHasher
fromsklearn.feature_extractionimportFeatureHasher
#herewehavecreatedthethevariablecalledgamedfandbyusingthe
#pandaslibrarywearereadingthecsvfile
#herewehaveusedtheencodingtypeasUTF-8asthefiletypeconversionofthecsvfile
#thenweraecheckingtheinitialst=rowsandcolumnsofthevariablecreatedcalledgamedf
game_df=pd.read_csv("vgsales.csv",encoding="utf-8")
game_df.head()
#herewearecheckingthecolumnsofthevariablecalledgamedf
game_df.columns
#herewearecheckingtheshapeofthegamedf
game_df.shape
#hereweaveselectedfewspecificcolumns
#ilocisthemethodthroughwhichwecanextractthespecificrowsandcolumns
#fromthespecificdatasets
#hereinthevariablecreatedcalledgamedfwehaveselectedfewspecificcolumns
#fromthedatasetsandilocisusedtoextracttherowsandcolumnsfromthedatasets.
#herewehaveilocrangeas1:7thatmeanspythonwilldisplays6values
game_df[['Name','Platform','Year','Genre','Publisher']].iloc[1:7]
Name Platform Year Genre Publisher
1 SuperMarioBros. NES 1985.0 Platform Nintendo
2 MarioKartWii Wii 2008.0 Racing Nintendo
3 WiiSportsResort Wii 2009.0 Sports Nintendo
4 PokemonRed/PokemonBlue GB 1996.0 Role-Playing Nintendo
5 Tetris GB 1989.0 Puzzle Nintendo
6 NewSuperMarioBros. DS 2006.0 Platform Nintendo
Totalgamegeneres:12
['Action''Adventure''Fighting''Misc''Platform''Puzzle''Racing'
'Role-Playing''Shooter''Simulation''Sports''Strategy']
Name Genre 0 1 2 3 4 5
0 WiiSports Sports -2.0 2.0 0.0 -2.0 0.0 0.0
1 SuperMarioBros. Platform 0.0 2.0 2.0 -1.0 1.0 0.0
2 MarioKartWii Racing -1.0 0.0 0.0 0.0 0.0 -1.0
3 WiiSportsResort Sports -2.0 2.0 0.0 -2.0 0.0 0.0
4 PokemonRed/PokemonBlue Role-Playing -1.0 1.0 2.0 0.0 1.0 -1.0
#herewehavecreatedthevariablecalledu_generesthenbyusing
#thenumpylibrarywearefindingtheuniquevaluesforgenreinthegamedf
#thenweareprintingtheTotalgamegeneresbyusing
#thelengthfunctioninthevariablecalledu_generes
#thenweareprintingtheu_generes
u_generes=np.unique(game_df["Genre"])
print("Totalgamegeneres:",len(u_generes))
print(u_generes)
#intheoutputwearehavingtotal12genresinthedatasets
#herewehavecreatedthevariablecalledfhforFeatureHasher
#andherewearecreatingthe6featuresandtheinputtypeisstringbasedinput
#thatmeanswhateverinputwearefeedingtothethisfeaturehashermethodthatwillbethestring
#typemethods.
#thenwehavecreatedthevariablecalledhashedfeatureandbyusing
#thefittranformmethodwearetransformingthestring
#thenweareconvertingittothearray
#thenwehavecreatedthenewdataframeandthenbyusingthepandaslibrary
#weareconcatinatingthatmeansarecreatingthecolumnsforthenameandgenreintothepandas
#dataframethatishashedfeatureinthecolumnsthatisaxis=1
#thenwearecheckingtheinitialrowsandcolumnsofthenewgamedataset
fh=FeatureHasher(n_features=6,input_type='string')
hashed_features=fh.fit_transform(game_df["Genre"])
hashed_features=hashed_features.toarray()
new_game_df=pd.concat([game_df[['Name','Genre']],pd.DataFrame(hashed_features)],axis=1)
new_game_df.head()
#importingtherequiredlibraries
#seabornandmatplotlibisusedforvisualization
importnumpyasnp
importpandasaspd
importseabornassns
importmatplotlib.pyplotasplt
%matplotlibinline
#herewehaveimportedthewarningslibrary
#thenfromwarningslibrarywehaveimportedthemodulecalledfilterwarnings("ignore")
#importwarningsistheentirelibraryandfromthatlibraryweareimportingthefilterwarnings
#andwehaveusedthefunctioncalledignorethatmeanswhatever
#wewillgetthewarninginthecodethatshouldbeignoredandthatwarningsshould
#nothaveanyaffectontheoutput
importwarnings
warnings.filterwarnings('ignore')
#herewehavecreatedthevariablecalleddf
#byusingthepandaslibrarywearereadingthecsvfile
#andalsocheckingtheinitialrowsandcolumnsofthedataset
df=pd.read_csv("HR-Employee-Attrition.csv")
df.head()
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ...
0 41 Yes Travel_Rarely 1102 Sales 1 2 LifeSciences 1 1 ...
1 49 No Travel_Frequently 279
Research&
Development
8 1 LifeSciences 1 2 ...
2 37 Yes Travel_Rarely 1373
Research&
Development
2 2 Other 1 4 ...
3 33 No Travel_Frequently 1392
Research&
Development
3 4 LifeSciences 1 5 ...
4 27 No Travel_Rarely 591
Research&
Development
2 1 Medical 1 7 ...
5rows×35columns
Index(['Age','Attrition','BusinessTravel','DailyRate','Department',
'DistanceFromHome','Education','EducationField','EmployeeCount',
'EmployeeNumber','EnvironmentSatisfaction','Gender','HourlyRate',
'JobInvolvement','JobLevel','JobRole','JobSatisfaction',
'MaritalStatus','MonthlyIncome','MonthlyRate','NumCompaniesWorked',
'Over18','OverTime','PercentSalaryHike','PerformanceRating',
'RelationshipSatisfaction','StandardHours','StockOptionLevel',
'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance',
'YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion',
'YearsWithCurrManager'],
dtype='object')
(1470,35)
Age0
Attrition0
BusinessTravel0
DailyRate0
Department0
DistanceFromHome0
Education0
EducationField0
EmployeeCount0
EmployeeNumber0
EnvironmentSatisfaction0
Gender0
HourlyRate0
JobInvolvement0
JobLevel0
JobRole0
JobSatisfaction0
MaritalStatus0
MonthlyIncome0
MonthlyRate0
NumCompaniesWorked0
Over180
OverTime0
PercentSalaryHike0
PerformanceRating0
RelationshipSatisfaction0
StandardHours0
StockOptionLevel0
TotalWorkingYears0
TrainingTimesLastYear0
WorkLifeBalance0
YearsAtCompany0
YearsInCurrentRole0
YearsSinceLastPromotion0
YearsWithCurrManager0
dtype:int64
#checkingthecolumnsofthedf
df.columns
#cehckingtheshapeofthedf
df.shape
#herewearecheckingthesumofnullvaluesinthedf
#inthoutputwedonthavethenullvalues
df.isna().sum()
#herewearefindingthecountofeachvaluesinthecolumns
#foreachcolumninourdatasetwearecreatingoneforloopandiisthereferencevariable
#thenweareprintingthevalueofithatmeansvalueofeachdatapoints
#wehaveprovidedthesliceofdatapoints
#andwearereadingivalueforeachcolumn
#:thiscoloniscalledasthesliceoperatoranditrepresentsthecountvalues
Age:3578
3477
3669
3169
2968
3261
3060
3358
3858
4057
3750
2748
2848
4246
3942
4541
4140
2639
4433
4633
4332
5030
2526
2426
4924
4724
5522
5119
5319
4819
5418
5218
2216
5614
2314
5814
2113
2011
5910
199
188
605
574
Name:Age,dtype:int64
_____________________________________
___
_____________________________________
___
Attrition:No1233
Yes237
Name:Attrition,dtype:int64
_____________________________________
___
_____________________________________
___
BusinessTravel:Travel_Rarely1043
Travel_Frequently277
Non-Travel150
Name:BusinessTravel,dtype:int64
_____________________________________
___
_____________________________________
___
DailyRate:6916
4085
5305
13295
10825
..
6501
2791
3161
3141
6281
Name:DailyRate,Length:886,dtype:int64
_____________________________________
___
_____________________________________
___
Department:Research&Development961
Sales446
#andweareprinting-dashby40times
#andwehaveused2times40becausetoseperatetheoutputofthedataasitisintheforloop
foriindf.columns:
print(i,":",df[i].value_counts())
print("_"*40)
print("_"*40)
HumanResources63
Name:Department,dtype:int64
_____________________________________
___
_____________________________________
___
DistanceFromHome:2211
1208
1086
985
384
784
880
565
464
659
1632
1129
2428
2327
2927
1526
1826
2625
2525
2025
2823
1922
1421
1220
1720
2219
1319
2118
2712
Name:DistanceFromHome,dtype:int64
_____________________________________
___
_____________________________________
___
Education:3572
4398
2282
1170
548
Name:Education,dtype:int64
_____________________________________
___
_____________________________________
___
EducationField:LifeSciences606
Medical464
Marketing159
TechnicalDegree132
Other82
HumanResources27
Name:EducationField,dtype:int64
_____________________________________
___
_____________________________________
___
EmployeeCount:11470
Name:EmployeeCount,dtype:int64
_____________________________________
___
_____________________________________
___
EmployeeNumber:11
13911
13891
13871
13831
..
6591
6571
6561
6551
20681
Name:EmployeeNumber,Length:1470,dtype:int64
_____________________________________
___
_____________________________________
___
EnvironmentSatisfaction:3453
4446
2287
1284
Name:EnvironmentSatisfaction,dtype:int64
_____________________________________
___
_____________________________________
___
Gender:Male882
Female588
Name:Gender,dtype:int64
_____________________________________
___
_____________________________________
___
HourlyRate:6629
9828
4228
4828
8428
..
3115
5314
6814
3813
3412
Name:HourlyRate,Length:71,dtype:int64
_____________________________________
___
_____________________________________
___
JobInvolvement:3868
2375
4144
183
Name:JobInvolvement,dtype:int64
_____________________________________
___
_____________________________________
___
JobLevel:1543
2534
3218
4106
569
Name:JobLevel,dtype:int64
_____________________________________
___
_____________________________________
___
JobRole:SalesExecutive326
ResearchScientist292
LaboratoryTechnician259
ManufacturingDirector145
HealthcareRepresentative131
Manager102
SalesRepresentative83
ResearchDirector80
HumanResources52
Name:JobRole,dtype:int64
_____________________________________
___
_____________________________________
___
JobSatisfaction:4459
3442
1289
2280
Name:JobSatisfaction,dtype:int64
_____________________________________
___
_____________________________________
___
MaritalStatus:Married673
Single470
Divorced327
Name:MaritalStatus,dtype:int64
_____________________________________
___
_____________________________________
___
MonthlyIncome:23424
61423
27413
25593
26103
..
71041
27731
195131
34471
44041
Name:MonthlyIncome,Length:1349,dtype:int64
_____________________________________
___
_____________________________________
___
MonthlyRate:42233
91503
95582
128582
220742
..
145611
26711
57181
117571
102281
Name:MonthlyRate,Length:1427,dtype:int64
_____________________________________
___
_____________________________________
___
NumCompaniesWorked:1521
0197
3159
2146
4139
774
670
563
952
849
Name:NumCompaniesWorked,dtype:int64
_____________________________________
___
_____________________________________
___
Over18:Y1470
Name:Over18,dtype:int64
_____________________________________
___
_____________________________________
___
OverTime:No1054
Yes416
Name:OverTime,dtype:int64
_____________________________________
___
_____________________________________
___
PercentSalaryHike:11210
13209
14201
12198
15101
1889
1782
1678
1976
2256
2055
2148
2328
2421
2518
Name:PercentSalaryHike,dtype:int64
_____________________________________
___
_____________________________________
___
PerformanceRating:31244
4226
Name:PerformanceRating,dtype:int64
_____________________________________
___
_____________________________________
___
RelationshipSatisfaction:3459
4432
2303
1276
Name:RelationshipSatisfaction,dtype:int64
_____________________________________
___
_____________________________________
___
StandardHours:801470
Name:StandardHours,dtype:int64
_____________________________________
___
_____________________________________
___
StockOptionLevel:0631
1596
2158
385
Name:StockOptionLevel,dtype:int64
_____________________________________
___
_____________________________________
___
TotalWorkingYears:10202
6125
8103
996
588
781
181
463
1248
342
1540
1637
1136
1336
2134
1733
231
1431
2030
1827
1922
2322
2221
2418
2514
2814
2614
011
2910
319
329
307
337
277
366
345
374
353
402
381
Name:TotalWorkingYears,dtype:int64
_____________________________________
___
_____________________________________
___
TrainingTimesLastYear:2547
3491
4123
5119
171
665
054
Name:TrainingTimesLastYear,dtype:int64
_____________________________________
___
_____________________________________
___
WorkLifeBalance:3893
2344
4153
180
Name:WorkLifeBalance,dtype:int64
_____________________________________
___
_____________________________________
___
YearsAtCompany:5196
1171
3128
2127
10120
4110
790
982
880
676
044
1132
2027
1324
1520
1418
2215
1214
2114
1813
1612
1911
179
246
335
254
264
313
323
272
362
292
232
371
401
341
301
Name:YearsAtCompany,dtype:int64
_____________________________________
___
_____________________________________
___
YearsInCurrentRole:2372
0244
7222
3135
4104
889
967
157
637
536
1029
1122
1314
1411
1210
158
167
174
182
Name:YearsInCurrentRole,dtype:int64
_____________________________________
___
_____________________________________
___
YearsSinceLastPromotion:0581
1357
2159
776
461
352
545
632
1124
818
917
1513
1310
1210
149
106
Name:YearsSinceLastPromotion,dtype:int64
_____________________________________
___
_____________________________________
___
YearsWithCurrManager:2344
0263
7216
3142
8107
498
176
964
531
629
1027
1122
1218
1314
177
155
145
162
Name:YearsWithCurrManager,dtype:int64
_____________________________________
___
_____________________________________
___
Age:[414937332732593038363529313428225324214244463943
50264855455623514054582025195752471860]
_____________________________________
___
_____________________________________
___
Attrition:['Yes''No']
_____________________________________
___
_____________________________________
___
BusinessTravel:['Travel_Rarely''Travel_Frequently''Non-Travel']
_____________________________________
___
_____________________________________
___
DailyRate:[11022791373139259110051324135821612998091536701346
103138933411231219371673121841939169912821125691
4777059241459125895813127386989085211414641240
135799472113601065408121112296261434148810971443515
853114265511154276539891435122383611951339664318
12251328108254813274677619339794512141115731153
14005414322886695306321334638109312171353120682
48980782787166510401420240128053414566581421127
10311189135414679223941312750441684249841147528
59447095754280213551150132995910331316364438689
2011427857933118113956621436194967149611691145630
303125644014501452465702115760214801268713134526
13801406291356328108493169210693138945561344290
138926126147210028789051180121113663511516441045
829124214698969921052114713966631199793191413944
1323532818854103477114011431976141113002521327832
10171199504505916124768526914168333071311128488
529121014636751385140345266611582289967281315322
14797971070442496137292068814491117636506444950
88955523012325661302812147621811321105906849390
106124919255311718510917231220588137710181275798
672116250814825592109281001549112473857011301192
34314412961309483810544106213196411332756845593
117135092111441431046575156128375530411783291362
137120225316411077591305982821138148014738911063
645149031742214851368144829613981349986109911161499
983100913031274127758741312769881474163267619302
44382856142623213061094509775195258471799956
53514954461245703823124662212874482541365538525
55878236212361112204134360412166461602381397306
99148211769131076727885243806817141012071442693
9295626085809701179294314316654168381217501
6501418049751090346430268167621527883954310
71972571565711461823765713847911111124310921325
8052131186761252286125893210418597209461184436
58976088713186251805861012661930342123012711278
60713030058314181269379395126512223418681231102
8811383107537410867811775001425145461710859951122
61854646211981272154113711881881333867263938129
6164981404105328913762311528829031379335722461
974112684011342489559391391120628714411091066277
466105526513524710352661451038123411091089788124
6601186146479641576910031366330149212043091330469
69712621050770406203130898443979314511182174490
71843377360387436719948164713849028198621457
9779421402142113619172001501796961163631071465
458121211039661010326109896911676941320536373599
2511312371429648735531429968879640412848360
1138325132229910306345242561060935495282206943
523507601855129114051369999120228540473614981200
1439499205683146294965233214753379711174667560
##herewearefindingtheuniquevaluesofeachvalues/elementinthecolumns
#foreachcolumninourdatasetwearecreatingoneforloopandiisthereferencevariable
#thenweareprintingthevalueofithatmeansvalueofeachdatapoints
#wehaveprovidedthesliceofdatapoints
#andwearereadingivalueforeachcolumn
#:thiscoloniscalledasthesliceoperatoranditrepresentsthecountvalues
#andweareprinting-dashby40times
#andwehaveused2times40becausetoseperatetheoutputofthedataasitisintheforloop
foriindf.columns:
print(i,":",df[i].unique())
print("_"*40)
print("_"*40)
1723831255359401377592144512218669814471326748
990405115790830119314234672714101083516224136
10293331440674134289882449259874088812881041108
47913514744378841370264105956345713132411015336
13871702086717117371470365763567486772301311
58488039214870812597863706781465819181238585
7415523697175439647926111768976001054428181
21110795903059534781375244511129419673412391253
11281336234766261119443157214221297574355207706
280726414352122445912541131835117212667832191213
1096125113946051064133793715775411681551444189911
132111545576428011611382103710558270434511201378
4686131023628]
_____________________________________
___
_____________________________________
___
Department:['Sales''Research&Development''HumanResources']
_____________________________________
___
_____________________________________
___
DistanceFromHome:[182324232716152619215119761042512182922
1420281713]
_____________________________________
___
_____________________________________
___
Education:[21435]
_____________________________________
___
_____________________________________
___
EducationField:['LifeSciences''Other''Medical''Marketing''TechnicalDegree'
'HumanResources']
_____________________________________
___
_____________________________________
___
EmployeeCount:[1]
_____________________________________
___
_____________________________________
___
EmployeeNumber:[124...206420652068]
_____________________________________
___
_____________________________________
___
EnvironmentSatisfaction:[2341]
_____________________________________
___
_____________________________________
___
Gender:['Female''Male']
_____________________________________
___
_____________________________________
___
HourlyRate:[946192564079816744844931935051809678
458253835872484241869775333773983647
713043995995577687665532527062646360
10046397735915434906588858968697438]
_____________________________________
___
_____________________________________
___
JobInvolvement:[3241]
_____________________________________
___
_____________________________________
___
JobLevel:[21345]
_____________________________________
___
_____________________________________
___
JobRole:['SalesExecutive''ResearchScientist''LaboratoryTechnician'
'ManufacturingDirector''HealthcareRepresentative''Manager'
'SalesRepresentative''ResearchDirector''HumanResources']
_____________________________________
___
_____________________________________
___
JobSatisfaction:[4231]
_____________________________________
___
_____________________________________
___
MaritalStatus:['Single''Married''Divorced']
_____________________________________
___
_____________________________________
___
MonthlyIncome:[599351302090...999153904404]
_____________________________________
___
_____________________________________
___
MonthlyRate:[19479249072396...51741324310228]
_____________________________________
___
_____________________________________
___
NumCompaniesWorked:[8169045273]
_____________________________________
___
_____________________________________
___
Over18:['Y']
_____________________________________
___
_____________________________________
___
OverTime:['Yes''No']
_____________________________________
___
_____________________________________
___
PercentSalaryHike:[112315121320222117141618192425]
_____________________________________
___
_____________________________________
___
PerformanceRating:[34]
_____________________________________
___
_____________________________________
___
RelationshipSatisfaction:[1423]
_____________________________________
___
_____________________________________
___
StandardHours:[80]
_____________________________________
___
_____________________________________
___
StockOptionLevel:[0132]
_____________________________________
___
_____________________________________
___
TotalWorkingYears:[81076121175331130262422919223141542928
21252011163738304018363432333527]
_____________________________________
___
_____________________________________
___
TrainingTimesLastYear:[0325146]
_____________________________________
___
_____________________________________
___
WorkLifeBalance:[1324]
_____________________________________
___
_____________________________________
___
YearsAtCompany:[61008271954253121422152721171113371620
40243319361829313234263023]
_____________________________________
___
_____________________________________
___
YearsInCurrentRole:[4702598361311514161110121817]
_____________________________________
___
_____________________________________
___
YearsSinceLastPromotion:[0132748651591312101114]
_____________________________________
___
_____________________________________
___
YearsWithCurrManager:[57026831117141291015131614]
_____________________________________
___
_____________________________________
___
(1470,31)
Index(['Age','Attrition','BusinessTravel','DailyRate','Department',
'DistanceFromHome','Education','EducationField',
'EnvironmentSatisfaction','Gender','HourlyRate','JobInvolvement',
'JobLevel','JobRole','JobSatisfaction','MaritalStatus',
'MonthlyIncome','MonthlyRate','NumCompaniesWorked','OverTime',
'PercentSalaryHike','PerformanceRating','RelationshipSatisfaction',
'StockOptionLevel','TotalWorkingYears','TrainingTimesLastYear',
'WorkLifeBalance','YearsAtCompany','YearsInCurrentRole',
'YearsSinceLastPromotion','YearsWithCurrManager'],
dtype='object')
RelationshipSatisfaction JobSatisfaction EnvironmentSatisfaction JobInvolvement
0 1 4 2 3
1 4 2 3 2
2 2 3 4 2
3 3 3 4 3
4 4 2 1 3
... ... ... ... ...
1465 3 4 3 4
1466 1 1 4 2
1467 2 2 2 4
1468 4 2 4 2
1469 1 3 2 4
1470rows×4columns
#dropfunctionisusedtodropthecolumnwhichisnotimportantinthedatasets
#herewearedroppingfewcolumnsfromthedatasetand
#axis=1meanswearedroppingtheelementsfromthecolumns
df=df.drop(['Over18','EmployeeNumber','EmployeeCount','StandardHours'],axis=1)
#checkingtheshapeofthedataset
df.shape
#checkingthecolumnsofthedatasets
df.columns
#herewehavecreatedthedataframeforfewcolumns
df[['RelationshipSatisfaction','JobSatisfaction','EnvironmentSatisfaction','JobInvolvement']]
#herewearecreatedthetotalstatisfactionmeanforthecolumns
#andwehavedefinedthevariablecalledsatifthatmeansstatisfactionif
#andbasedontheconditionifwehavetheTotalSatisfaction_meangreater
#than2.35thenitwillreturn1orelseitwillreturn0
#defmeanwillusedwhenweareworkingononecolumnonlybecausemeanworksononlyonecolumn
#lambdaisthefunctiontocreatethefunctionanousmously,wedonthavetogivenaynametoit
#thenwehavecreatedthedf['Satif']asthecolumnandbyusingtheapplymethodoflambda
#weareapplyingitonthedf['Satif']onthecolumnthatisaxis=1
#andwearecheckingforthedf['Satif']
df['TotalSatisfaction_mean']=(df['RelationshipSatisfaction']+df['EnvironmentSatisfaction']
+df['JobSatisfaction']+df['JobInvolvement']+df['WorkLifeBalance'])
/5
defSatif(df):
ifdf['TotalSatisfaction_mean']>2.35:
return1
else:
return0


df['Satif']=df.apply(lambdadf:Satif(df),axis=1)
df['Satif']





#checkingtheshapeofthedf
(1470,32)
(1470,33)
01
10
21
30
41
..
14650
14660
14670
14680
14690
Name:MovingPeople,Length:1470,dtype:int64
(1470,34)
00
10
20
30
40
..
14651
14660
14670
14680
14690
Name:LongDis,Length:1470,dtype:int64
# Sanity-check the frame's dimensions before adding the next feature.
df.shape
# Job-focused satisfaction: mean of the two job ratings
# (sum of the elements divided by their count).
df['JobSatisf_mean'] = (df['JobSatisfaction'] + df['JobInvolvement']) / 2
# Shape check after the new column.
df.shape
#methodsalwayscomesinroundbracket
#herewehavedefinedthefunctioncalledmovingpeople
#anddfistheparameterwearepassinghereandwecanpasstheentiredatasetshere
#thenwearedefiningtheifconditionthatmeansifsomeonehasworkedformorethan
#4companiesthenreturn1orelsereturn0
#applymethodwillallowsustopassthelogicofthefunctionandapplythefunctiontoeachvalue
#ofthepandasseries
#thenwehavecreatedthedf['MovingPeople']asthecolumnandbyusingtheapplymethodoflambda
#weareapplyingitonthedf['MovingPeople']onthecolumnthatisaxis=1
#andwearecheckingforthedf['MovingPeople']
defMovingPeople(df):
ifdf['NumCompaniesWorked']>4:
return1
else:
return0
# Apply the flag per row, store and display it, then re-check the shape.
df['MovingPeople'] = df.apply(MovingPeople, axis=1)
df['MovingPeople']
df.shape
#herewearecreatingthecolumnfordistancefromhomethatisLongDis(df)
#andwehavedefinedthefunctioncalleddef
#thenwehavedefinedtheifconditionthatisifthedistancefromhomeisgreaterthan11
#thenreturn1orelsereturn0
##thenwehavecreatedtheasthecolumndf['LongDis']andbyusingtheapplymethodoflambda
#weareapplyingitonthedf['LongDis']onthecolumnthatisaxis=1
#andwearecheckingforthedf['LongDis']
defLongDis(df):
ifdf['DistanceFromHome']>11:
return1
else:
return0
# Evaluate the commute flag for every row and display the new column.
df['LongDis'] = df.apply(LongDis, axis=1)
df['LongDis']
#herewearecreatingthecolumnformiddletrainingandwehavedefinedthefunctioncalleddef
#thenwehavedefinedtheifconditionthatisifthetrainingtimelastyearisgreater
#thanorequalto3andthetrainingtimelastyearislessthanorequalto6then
02.333333
114.500000
22.428571
36.500000
40.700000
...
14653.200000
14663.800000
14673.500000
14689.666667
14694.666667
Name:Time_in_each_comp,Length:1470,dtype:float64
(1470,37)
Category:['Attrition','BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus'
,'OverTime']
Numeric:['Age','DailyRate','DistanceFromHome','Education','EnvironmentSatisfaction','HourlyRate','JobI
nvolvement','JobLevel','JobSatisfaction','MonthlyIncome','MonthlyRate','NumCompaniesWorked','PercentSalar
yHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel','TotalWorkingYears','TrainingTime
sLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCur
rManager','TotalSatisfaction_mean','JobSatisf_mean','MovingPeople','LongDis','MiddleTraining','Time_in_ea
ch_comp']
(1470,28)
#return1orelsereturn0
#herewiththedifferentdifferentconditionswehavecreatedthenewcolumns
#thenwehavecreatedtheasthecolumnMiddleTrainingandbyusingtheapplymethodoflambda
#weareapplyingitontheMiddleTrainingonthecolumnthatisaxis=1
#andexecutingthecode
defMiddleTraining(df):
ifdf['TrainingTimesLastYear']>=3anddf['TrainingTimesLastYear']<=6:
return1
else:
return0
# Apply the training flag once per row and store it as a new column.
df['MiddleTraining'] = df.apply(MiddleTraining, axis=1)
#herewearecreatingthecolumncalleddf['Time_in_each_comp']
#andsubstracting20fromageanddivisingitbytheNumCompaniesWorkedplus1
#toreadtheentirevaluesweareusingtheplus1
#andthenwearecheckingforthetimeineachcompany
df['Time_in_each_comp']=(df['Age']-20)/((df)['NumCompaniesWorked']+1)
df['Time_in_each_comp']
#checkingtheshapeofthedf
df.shape
#herewearefindingthenumericandthecategoricaldataframes
#dtypesistoselectthedatatypesfromthedataframe
#np.numberisthemethodforselectingonlynumbers
#hereinthenumericdataframeswehaveselectedthedatatypesthatisdtypestoincludethe
#numberssowehaveusedincludenp.number
#theninthecategoricaldataframewehaveselectedthedatatypesthatisdtypestoexclude
#thenumbersowehaveusedexcludenp.number
#wehaveusedexcludenp.numberasitwillexcludethenumbersandleftwiththecategoricalvalues
numeric_df=df.select_dtypes(include=[np.number])
categoric_df=df.select_dtypes(exclude=[np.number])
#herewearecreatingthelistofthenumericalcolumnsandthecategoricalcolumns
#inthenumericalcolumnswehavecreatedthecolumnsfornumbers
#whichwillbeconvertedtothelist
#inthecategoricalcolumnswehavecreatedthecolumnsforcategorieswhich
#willbeconvertedtothelist
#thenwehaveprintedthecategoricalcolumnsforcategoriesandthenumericalcolumnsfornumbers
numericcol=numeric_df.columns.tolist()
categorycol=categoric_df.columns.tolist()
print("Category:",categorycol)
print("\nNumeric:",numericcol)
#herewehavedroppedsomeofthecolumnsfromthedffromaxis=1thatiscolumns
df=df.drop(['DailyRate','DistanceFromHome','EnvironmentSatisfaction',
'HourlyRate','JobInvolvement',
'JobSatisfaction','NumCompaniesWorked',
'RelationshipSatisfaction','TrainingTimesLastYear'],axis=1)
#checkingtheshapeofthedf
df.shape
#herewehavecreatedthevariablecalleddata
Index(['Age','Education','JobLevel','MonthlyIncome','MonthlyRate',
'PercentSalaryHike','PerformanceRating','StockOptionLevel',
'TotalWorkingYears','WorkLifeBalance','YearsAtCompany',
'YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager',
'TotalSatisfaction_mean','JobSatisf_mean','MovingPeople','LongDis',
'MiddleTraining','Time_in_each_comp','Attrition_Yes',
'BusinessTravel_Travel_Frequently','BusinessTravel_Travel_Rarely',
'Department_Research&Development','Department_Sales',
'EducationField_LifeSciences','EducationField_Marketing',
'EducationField_Medical','EducationField_Other',
'EducationField_TechnicalDegree','Gender_Male',
'JobRole_HumanResources','JobRole_LaboratoryTechnician',
'JobRole_Manager','JobRole_ManufacturingDirector',
'JobRole_ResearchDirector','JobRole_ResearchScientist',
'JobRole_SalesExecutive','JobRole_SalesRepresentative',
'MaritalStatus_Married','MaritalStatus_Single','OverTime_Yes'],
dtype='object')
(1470,42)
Age Education JobLevel MonthlyIncome MonthlyRate PercentSalaryHike PerformanceRating StockOptionLevel TotalWorkingYears WorkLifeBalance
0 41 2 2 5993 19479 11 3 0 8
1 49 1 2 5130 24907 23 4 1 10
2 37 2 1 2090 2396 15 3 0 7
3 33 4 1 2909 23159 11 3 0 8
4 27 1 1 3468 16632 12 3 1 6
5rows×42columns
#herewehavecreatedthevariablecalleddata
#thenbyusingthepandaslibrarywearecreatingthedummiesinthedfforthecolumns
#selectedisthecategoricalcolumns
#anddropfirst=trueisusedtodropthenumericalvalues
#thenwehaveprintedthecolumnsofthedata
#andalsocheckedtheshapeofthedata
data=pd.get_dummies(df,columns=categorycol,drop_first=True)
print(data.columns)
print(data.shape)
#herewehavecheckedtheinitialrowsandcolumnsofthedata
data.head()
#EDA-exploratorydataanalysis
#wearedroppingtheelementsinthedatasetiftheyaretheirrelevantcolumnsofthedataset
#thenweusethedropfunction
#removingtheunneccessaryinformationfromthedatasetiscalledthedataclaeningprocess
#checkingthedatatypesisnottheneccessaryprocess
#inthesupervisedlearningweareusingthetrainandtestdataandiintheunsupervisedlearning
#weareusingthevariousalgorithms.
#hereweareimportingtherequiredlibraries
importnumpyasnp
importpandasaspd
importmatplotlib.pyplotasplt
%matplotlibinline
#herewehavecreatedthevariablecalleddataframe.
#byusingthepandaslibrarywearereadingthecsvfile
dataframe=pd.read_csv('housing_data.csv')
#herewearecreatingthe1stcolumnasanindex
#thatmeanswehavecreatedtheindexfromthecolumn0
dataframe=('index_col=0')
#herewehavecheckedheinitialrowsandcolumnsofthedataframe
dataframe.head(5)
Unnamed:
0
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature
0 0 SC60 RL 65.0 8450 Pave None Reg Lvl AllPub ... 0 No No
1 1 SC20 RL 80.0 9600 Pave None Reg Lvl AllPub ... 0 No No
2 2 SC60 RL 68.0 11250 Pave None IR1 Lvl AllPub ... 0 No No
3 3 SC70 RL 60.0 9550 Pave None IR1 Lvl AllPub ... 0 No No
4 4 SC60 RL 84.0 14260 Pave None IR1 Lvl AllPub ... 0 No No
5rows×81columns
Unnamed:
0
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature
1455 1455 SC60 RL 62.0 7917 Pave None Reg Lvl AllPub ... 0 No No
1456 1456 SC20 RL 85.0 13175 Pave None Reg Lvl AllPub ... 0 No MnPrv
1457 1457 SC70 RL 66.0 9042 Pave None Reg Lvl AllPub ... 0 No GdPrv
1458 1458 SC20 RL 68.0 9717 Pave None Reg Lvl AllPub ... 0 No No
1459 1459 SC20 RL 75.0 9937 Pave None Reg Lvl AllPub ... 0 No No
5rows×81columns
(1460,81)
#herewehavecheckedthelastrowsandcolumnsofthedataset
#soherewehavecheckedthelast5rowsandcolumnsofthedataset
dataframe.tail(5)
#herewehavecheckedtheshapeofthedataframe
dataframe.shape
#herewehavecheckedtheinformationpresnetinthedataframe
dataframe.info()
<class'pandas.core.frame.DataFrame'>
RangeIndex:1460entries,0to1459
Datacolumns(total81columns):
#ColumnNon-NullCountDtype
----------------------------
0Unnamed:01460non-nullint64
1MSSubClass1460non-nullobject
2MSZoning1460non-nullobject
3LotFrontage1460non-nullfloat64
4LotArea1460non-nullint64
5Street1460non-nullobject
6Alley1460non-nullobject
7LotShape1460non-nullobject
8LandContour1460non-nullobject
9Utilities1460non-nullobject
10LotConfig1460non-nullobject
11LandSlope1460non-nullobject
12Neighborhood1460non-nullobject
13Condition11460non-nullobject
14Condition21460non-nullobject
15BldgType1460non-nullobject
16HouseStyle1460non-nullobject
17OverallQual1460non-nullint64
18OverallCond1460non-nullint64
19YearBuilt1460non-nullint64
20YearRemodAdd1460non-nullint64
21RoofStyle1460non-nullobject
22RoofMatl1460non-nullobject
23Exterior1st1460non-nullobject
24Exterior2nd1460non-nullobject
25MasVnrType1460non-nullobject
26MasVnrArea1460non-nullfloat64
27ExterQual1460non-nullobject
28ExterCond1460non-nullobject
29Foundation1460non-nullobject
30BsmtQual1460non-nullobject
31BsmtCond1460non-nullobject
32BsmtExposure1460non-nullobject
33BsmtFinType11460non-nullobject
34BsmtFinSF11460non-nullint64
35BsmtFinType21460non-nullobject
36BsmtFinSF21460non-nullint64
37BsmtUnfSF1460non-nullint64
38TotalBsmtSF1460non-nullint64
39Heating1460non-nullobject
40HeatingQC1460non-nullobject
41CentralAir1460non-nullobject
42Electrical1459non-nullobject
431stFlrSF1460non-nullint64
442ndFlrSF1460non-nullint64
45LowQualFinSF1460non-nullint64
46GrLivArea1460non-nullint64
47BsmtFullBath1460non-nullint64
48BsmtHalfBath1460non-nullint64
49FullBath1460non-nullint64
50HalfBath1460non-nullint64
51BedroomAbvGr1460non-nullint64
52KitchenAbvGr1460non-nullint64
53KitchenQual1460non-nullobject
54TotRmsAbvGrd1460non-nullint64
55Functional1460non-nullobject
56Fireplaces1460non-nullint64
57FireplaceQu1460non-nullobject
58GarageType1460non-nullobject
59GarageYrBlt1379non-nullfloat64
60GarageFinish1460non-nullobject
61GarageCars1460non-nullint64
62GarageArea1460non-nullint64
63GarageQual1460non-nullobject
64GarageCond1460non-nullobject
65PavedDrive1460non-nullobject
66WoodDeckSF1460non-nullint64
67OpenPorchSF1460non-nullint64
68EnclosedPorch1460non-nullint64
693SsnPorch1460non-nullint64
70ScreenPorch1460non-nullint64
71PoolArea1460non-nullint64
72PoolQC1460non-nullobject
73Fence1460non-nullobject
74MiscFeature1460non-nullobject
75MiscVal1460non-nullint64
76MoSold1460non-nullobject
77YrSold1460non-nullint64
78SaleType1460non-nullobject
79SaleCondition1460non-nullobject
80SalePrice1460non-nullint64
dtypes:float64(3),int64(33),object(45)
memoryusage:924.0+KB
#herewehavecheckedthenumericalelementsofthedataframe
Unnamed:0 LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2
count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000
mean 729.500000 57.623288 10516.828082 6.099315 5.575342 1971.267808 1984.865753 103.117123 443.639726 46.549315
std 421.610009 34.664304 9981.264932 1.382997 1.112799 30.202904 20.645407 180.731373 456.098091 161.319273
min 0.000000 0.000000 1300.000000 1.000000 1.000000 1872.000000 1950.000000 0.000000 0.000000 0.000000
25% 364.750000 42.000000 7553.500000 5.000000 5.000000 1954.000000 1967.000000 0.000000 0.000000 0.000000
50% 729.500000 63.000000 9478.500000 6.000000 5.000000 1973.000000 1994.000000 0.000000 383.500000 0.000000
75% 1094.250000 79.000000 11601.500000 7.000000 6.000000 2000.000000 2004.000000 164.250000 712.250000 0.000000
max 1459.000000 313.000000 215245.000000 10.000000 9.000000 2010.000000 2010.000000 1600.000000 5644.000000 1474.000000
8rows×36columns
['Unnamed:0',
'LotFrontage',
'LotArea',
'OverallQual',
'OverallCond',
'YearBuilt',
'YearRemodAdd',
'MasVnrArea',
'BsmtFinSF1',
'BsmtFinSF2',
'BsmtUnfSF',
'TotalBsmtSF',
'1stFlrSF',
'2ndFlrSF',
'LowQualFinSF',
'GrLivArea',
'BsmtFullBath',
'BsmtHalfBath',
'FullBath',
'HalfBath',
'BedroomAbvGr',
'KitchenAbvGr',
'TotRmsAbvGrd',
'Fireplaces',
'GarageYrBlt',
'GarageCars',
'GarageArea',
'WoodDeckSF',
'OpenPorchSF',
'EnclosedPorch',
'3SsnPorch',
'ScreenPorch',
'PoolArea',
'MiscVal',
'YrSold',
'SalePrice']
#herewehavecheckedthenumericalelementsofthedataframe
#describefunctionisusedtocheckthecategoricalelementsinthedataset
dataframe.describe()
#herewehavecreatedthevariablecallednumerical_feature_columns
#thenwearecreatingthelistofthevariablecallednumerical_feature_columns
#inthedataframetogetthenumericdatainthecolumns
#andthenweareprintingthenumerical_feature_columns
numerical_feature_columns=list(dataframe._get_numeric_data().columns)
numerical_feature_columns
#herewehavecreatedthevariablecalledcategorical_feature_columns
#intheentiredataframewehavethenumericalandthecategoricaldata
#setfunctionallowsustoremovetheduplicateelementsfromthedataframe
#andgettheuniqueelementsinthedataframe
#thenbyusingthesetfunctioninthelistwearecreatingthecolumnsoftheentiredataframe
#andagainbyusingthesetfunctionwearesubstractingthecolumnsofthenumericdataframe
#togetthecategoricalelements
#thenweareprintingthecategorical_feature_columns
categorical_feature_columns=list(set(dataframe.columns)-set(dataframe._get_numeric_data()
.columns))
categorical_feature_columns
['MoSold',
'MiscFeature',
'Utilities',
'GarageQual',
'PavedDrive',
'PoolQC',
'GarageCond',
'Fence',
'Condition2',
'LotConfig',
'FireplaceQu',
'BsmtCond',
'Street',
'Electrical',
'HeatingQC',
'Neighborhood',
'BsmtFinType2',
'Exterior2nd',
'LandSlope',
'LandContour',
'GarageType',
'MasVnrType',
'ExterCond',
'ExterQual',
'Alley',
'CentralAir',
'MSZoning',
'SaleType',
'Exterior1st',
'RoofMatl',
'BsmtQual',
'LotShape',
'KitchenQual',
'Functional',
'Foundation',
'SaleCondition',
'Condition1',
'RoofStyle',
'BldgType',
'BsmtExposure',
'MSSubClass',
'GarageFinish',
'HouseStyle',
'BsmtFinType1',
'Heating']
#herewehaveimportedtheseaborn
#hereweareplotitngthehistogrambyusingtheseabornlibrraytoplottheadvanced/
#interactuvegraphs
#configurationalcharacteristicswearetakingfromthematplotlib
#syntaxoftherangefunctioniswheretostart,wheretostop,thenumberofelements
#thestoparguemntisusedtoupperbondoftherangeherewearetryingtocontroltheincrement
#ofthedata
#herewearecreatingthecolumnsforthe'YearBuilt','TotalBsmtSF','GrLivArea','SalePrice'
#thenweareexecutingtheforloop
#wearedefiningtherange0thatisfrom1stindexpositiontotheentirelengthofthecolumns
#thatwehavecreated
#bydefaultthevalueis2aswehavestartedfromrange0thatisfromthestep1
#usingthepltfigurewearedescribingthesizeofthefigure10,4standsforlengthand
#bredthrespectively
#usingthepltsubplot121means1row,2columnsandineachcolumnwewantonly1graph
#subplot121-1istherow,2isthecolumnand1representstheindexor1stplot
#subplot122represents-1row,2columnsand2indexex
#distplotisthekindofhistogramwhichwillgivethedistributionofthedata
#byusingthesnsdistplotwearepassingeachcolumninthedataframeandkdeistheparameter
#wearepassinginthedistplot
#KDEstandsforkerneldensityestimation
#KDEisusedtopredictthePDFthatisprobablityofacontinousvariable
#weareusingthekde=falsetogetthegrapghicalformatotherwise
#wewillgetthebellshapedgraphtightlayout
#tightlayout-itwilltakecareofthesubplotthatwillbegeneratedinthedefinedfigure
#itwillnotgooutsidethedefinedfigure
#pltshowisusedtoshowthegraph
#distplotstandsfordistributionplot
#i+1meansonebyonecolumnplotprints
importseabornassns
num_cols=['YearBuilt','TotalBsmtSF','GrLivArea','SalePrice']
foriinrange(0,len(num_cols),2):
plt.figure(figsize=(10,4))
plt.subplot(121)
sns.distplot(dataframe[num_cols[i]],kde=False)
plt.subplot(122)
sns.distplot(dataframe[num_cols[i+1]],kde=False)
plt.tight_layout()
plt.show()
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\distributions.py:2619:FutureWarning:`distplot`isadepre
catedfunctionandwillberemovedinafutureversion.Pleaseadaptyourcodetouseeither`displot`(afigur
e-levelfunctionwithsimilarflexibility)or`histplot`(anaxes-levelfunctionforhistograms).
warnings.warn(msg,FutureWarning)
#herewehaveimportedtheseaborn
#hereweareplotitngthehistogrambyusingtheseabornlibrraytoplottheadvanced/
#interactuvegraphs
#configurationalcharacteristicswearetakingfromthematplotlib
#syntaxoftherangefunctioniswheretostart,wheretostop,thenumberofelements
#thestoparguemntisusedtoupperbondoftherangeherewearetryingtocontrol
#theincrementofthedata
#herewearecreatingthecolumnsforthe'YearBuilt','TotalBsmtSF','GrLivArea','SalePrice'
#thenweareexecutingtheforloop
#wearedefiningtherange0thatisfrom1stindexpositiontotheentirelengthofthecolumns
#thatwehavecreated
#bydefaultthevalueis2aswehavestartedfromrange0thatisfromthestep1
#usingthepltfigurewearedescribingthesizeofthefigure10,4standsforlengthand
#bredthrespectively
#usingthepltsubplot121means1row,2columnsandineachcolumnwewantonly1graph
#subplot121-1istherow,2isthecolumnand1representstheindexor1stplot
#subplot122represents-1row,2columnsand2indexex
#distplotisthekindofhistogramwhichwillgivethedistributionofthedata
#byusingthesnsdistplotwearepassingeachcolumninthedataframeandkdeistheparameter
#wearepassinginthedistplot
#KDEstandsforkerneldensityestimation
#KDEisusedtopredictthePDFthatisprobablityofacontinousvariable
#weareusingthekde=falsetogetthegrapghicalformatotherwisewewillgtthebellshaped
#graphtightlayout
#herewewanttocreatethehitogramandkde
#wehavehitogram=trueandkde=truetogetthebellshapedgraph
#tightlayout-itwilltakecareofthesubplotthatwillbegeneratedinthedefinedfigure
#itwillnotgooutsidethedefinedfigure
#pltshowisusedtoshowthegraph
#distplotstandsfordistributionplot
#i+1meansonebyonecolumnplotprints
num_cols=['YearBuilt','TotalBsmtSF','GrLivArea','SalePrice']
foriinrange(0,len(num_cols),2):
plt.figure(figsize=(10,4))
plt.subplot(121)
sns.distplot(dataframe[num_cols[i]],hist=True,kde=True)
plt.subplot(122)
sns.distplot(dataframe[num_cols[i+1]],hist=True,kde=True)
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\distributions.py:2619:FutureWarning:`distplot`isadepre
catedfunctionandwillberemovedinafutureversion.Pleaseadaptyourcodetouseeither`displot`(afigur
e-levelfunctionwithsimilarflexibility)or`histplot`(anaxes-levelfunctionforhistograms).
warnings.warn(msg,FutureWarning)
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\distributions.py:2619:FutureWarning:`distplot`isadepre
catedfunctionandwillberemovedinafutureversion.Pleaseadaptyourcodetouseeither`displot`(afigur
e-levelfunctionwithsimilarflexibility)or`histplot`(anaxes-levelfunctionforhistograms).
warnings.warn(msg,FutureWarning)
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\distributions.py:2619:FutureWarning:`distplot`isadepre
catedfunctionandwillberemovedinafutureversion.Pleaseadaptyourcodetouseeither`displot`(afigur
e-levelfunctionwithsimilarflexibility)or`histplot`(anaxes-levelfunctionforhistograms).
warnings.warn(msg,FutureWarning)
# Finish the last figure from the loop above: tight_layout packs the
# subplots inside the figure bounds, show() renders it.
plt.tight_layout()
plt.show()
#thebarrepresentsthehistogram
#thegrapghrepresentsthekde
#bydeafulthistparametertakethetruevalue
#herewehaveimportedtheseaborn
#hereweareplotitngthehistogrambyusingtheseabornlibrraytoplottheadvanced/
#interactuvegraphs
#configurationalcharacteristicswearetakingfromthematplotlib
#syntaxoftherangefunctioniswheretostart,wheretostop,thenumberofelements
#thestoparguemntisusedtoupperbondoftherange
#herewearetryingtocontroltheincrementofthedata
#herewearecreatingthecolumnsforthe'YearBuilt','TotalBsmtSF','GrLivArea','SalePrice'
#thenweareexecutingtheforloop
#wearedefiningtherange0thatisfrom1stindexpositiontotheentirelengthofthe
#columnsthatwehavecreated
#bydefaultthevalueis2aswehavestartedfromrange0thatisfromthestep1
#usingthepltfigurewearedescribingthesizeofthefigure10,4standsforlengthandbredth
#respectively
#usingthepltsubplot121means1row,2columnsandineachcolumnwewantonly1graph
#subplot121-1istherow,2isthecolumnand1representstheindexor1stplot
#subplot122represents-1row,2columnsand2indexex
#facet-itisusedtoinitializethedatasetsandthevariablethatisusedtogenerate
#thestructureoftheboxplot
#instaedoftakingtheerandomcolumnstakethedefinedcolumnsthatwhywehaveusedfacet
#similarlywehavecreatedthe2ndboxplotbyusingthei+1columnandthefacetisusedtoget
#thedefinedcolumns
#byusingthesnsboxplotwearepassingeachcolumninthedataframeandkdeistheparameter
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\_decorators.py:36:FutureWarning:Passthefollowingvariab
lesaskeywordargs:x,y.Fromversion0.12,theonlyvalidpositionalargumentwillbe`data`,andpassingot
herargumentswithoutanexplicitkeywordwillresultinanerrorormisinterpretation.
warnings.warn(
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\_decorators.py:36:FutureWarning:Passthefollowingvariab
lesaskeywordargs:x,y.Fromversion0.12,theonlyvalidpositionalargumentwillbe`data`,andpassingot
herargumentswithoutanexplicitkeywordwillresultinanerrorormisinterpretation.
warnings.warn(
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\_decorators.py:36:FutureWarning:Passthefollowingvariab
lesaskeywordargs:x,y.Fromversion0.12,theonlyvalidpositionalargumentwillbe`data`,andpassingot
herargumentswithoutanexplicitkeywordwillresultinanerrorormisinterpretation.
warnings.warn(
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\_decorators.py:36:FutureWarning:Passthefollowingvariab
lesaskeywordargs:x,y.Fromversion0.12,theonlyvalidpositionalargumentwillbe`data`,andpassingot
herargumentswithoutanexplicitkeywordwillresultinanerrorormisinterpretation.
warnings.warn(
#wearepassingintheboxplotandfacetisusedtogetthedefinedcolumns
#KDEstandsforkerneldensityestimation
#KDEisusedtopredictthePDFthatisprobablityofacontinousvariable
#weareusingthekde=falsetogetthegrapghicalformatotherwisewewillgtthebell
#shapedgraphtightlayout
#herewewanttocreatethehitogramandkde
#wehavehitogram=trueandkde=truetogetthebellshapedgraph
#tightlayout-itwilltakecareofthesubplotthatwillbegeneratedinthedefinedfigure
#itwillnotgooutsidethedefinedfigure
#pltshowisusedtoshowthegraph
#distplotstandsfordistributionplot
#i+1meansonebyonecolumnplotprints
num_cols=['YearBuilt','TotalBsmtSF','GrLivArea','SalePrice']
facet=None
foriinrange(0,len(num_cols),2):
plt.figure(figsize=(10,4))
plt.subplot(121)
sns.boxplot(facet,num_cols[i],data=dataframe)
plt.subplot(122)
sns.boxplot(facet,num_cols[i+1],data=dataframe)
plt.tight_layout()
plt.show()
#dataframeisthevariablethatwehavecreatedpreviously
#hereweareplottingthecountplotbyusingtheseabornlibraryfor
#thecategoricalvalueinthedatafromthevariablecalleddataframe
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\_decorators.py:36:FutureWarning:Passthefollowingvariab
leasakeywordarg:x.Fromversion0.12,theonlyvalidpositionalargumentwillbe`data`,andpassingother
argumentswithoutanexplicitkeywordwillresultinanerrorormisinterpretation.
warnings.warn(
<AxesSubplot:xlabel='SaleCondition',ylabel='count'>
<AxesSubplot:xlabel='count',ylabel='Neighborhood'>
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\_decorators.py:36:FutureWarning:Passthefollowingvariab
lesaskeywordargs:x,y.Fromversion0.12,theonlyvalidpositionalargumentwillbe`data`,andpassingot
herargumentswithoutanexplicitkeywordwillresultinanerrorormisinterpretation.
warnings.warn(
# Bar chart counting houses per sale condition; countplot draws on the
# x axis by default.
# NOTE(fix): seaborn 0.12 requires the column as a keyword (x=...) — the
# positional form raised a FutureWarning in the original run.
sns.countplot(x='SaleCondition', data=dataframe)
#hereweareplottingthefigureofthesize12,6thatisthelengthandbredthrespectively
#bydeafultcountplotworksonxaxis
#thenbyusingthecountplotweareconstructingtheelementontheyaxisinthedatafrom
#thevariablecalleddataframe
plt.figure(figsize=(12,6))
sns.countplot(y='Neighborhood',data=dataframe)
#forplottingthelinearrelationshpweareplottingthelmplot
#thestraightlineshowsthelinearrelationship
#iffit_regisfalsewewillnotgetthelinearrelationshipline
#soherebyusingthesnslmplotwehaveplottedthelmplotfor4elements
#inthedatafromthevariablecreatedcalleddataframe
#torepresentthelinearrelationshipweareusingthefitreg
#bydefaultthelmplotwilltakefitreg=truetorepresentthelinearrelationship
sns.lmplot('GrLivArea','SalePrice',data=dataframe,fit_reg=True)
<seaborn.axisgrid.FacetGridat0x169bbfa9370>
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\_decorators.py:36:FutureWarning:Passthefollowingvariab
lesaskeywordargs:x,y.Fromversion0.12,theonlyvalidpositionalargumentwillbe`data`,andpassingot
herargumentswithoutanexplicitkeywordwillresultinanerrorormisinterpretation.
warnings.warn(
<seaborn.axisgrid.JointGridat0x169bba26040>
# Joint plot: a bivariate scatter with a regression line (kind='reg')
# in the centre and univariate histogram/KDE marginals on the sides.
# NOTE(fix): x/y passed as keywords per the seaborn 0.12 API (the
# positional form raised a FutureWarning in the original run).
sns.jointplot(x='TotalBsmtSF', y='SalePrice', data=dataframe, kind='reg')
#jointplotgeneratesforbyvirategraphandtheunivariategraph
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\_decorators.py:36:FutureWarning:Passthefollowingvariab
lesaskeywordargs:x,y.Fromversion0.12,theonlyvalidpositionalargumentwillbe`data`,andpassingot
herargumentswithoutanexplicitkeywordwillresultinanerrorormisinterpretation.
warnings.warn(
<seaborn.axisgrid.JointGridat0x169bbb00100>
<AxesSubplot:>
# Joint plot with hexagonal binning (kind='hex'): point density is shown
# by hexagon shading, with histogram marginals on the sides.
# NOTE(fix): x/y passed as keywords per the seaborn 0.12 API (the
# positional form raised a FutureWarning in the original run).
sns.jointplot(x='YearBuilt', y='SalePrice', data=dataframe, kind='hex')
#hereweplottedtheheatmapofthefiguresize12,8whichisthelengthandbreadthrespectively
#heatmapwillbydefaultcalculatethecorrelationofthenumericalvalueswehave
#cmapstandsforcolormap
#herewearecreatingtheheatmapbyusingtheseabornlibrarywhichwill
#calculatethecorrelationinthepreviouslycreatedvariablecalleddataframe
#andwehavegiventhecolorofthemapasvirdis
plt.figure(figsize=(12,8))
sns.heatmap(dataframe.corr(),cmap='viridis')
<AxesSubplot:>
#k=10meanswehavegiventhenumberofvariablesforheatmap
#thenwehavecreatedthevariablescalledcolsthatiscolumnsforthe
#dataframeandcalculatethecorrelationandthelargestvalueforwhichwewant
#tocalculatethecorrelationis10andthesalespriceistretaedasanindex
#asitwillbethestartingelement
#thenwehavecreatedtheothervariablecalledcmanddefinethecolumnsinthedataframe
#tocalculatethecorrelation
#thenwearedefiningthefigureoftheplotthatis10,6meanslengthandbreadthoftheplot
#andcreatingtheheatmapbyusingthesebornlibraryforthevariablecmandgivenannotaion
#andcmapstandsforthecolorofthemapthatisvirdis
k=10
cols=dataframe.corr().nlargest(k,'SalePrice')['SalePrice'].index
cm=dataframe[cols].corr()
plt.figure(figsize=(10,6))
sns.heatmap(cm,annot=True,cmap='viridis')
<seaborn.axisgrid.PairGridat0x169bc06f9a0>
# Visualise pairwise relations between the major variables: pairplot
# draws a scatter-matrix of every column against every other, with the
# univariate distribution of each column on the diagonal.
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars',
        'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(dataframe[cols])
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\_decorators.py:36:FutureWarning:Passthefollowingvariab
lesaskeywordargs:x,y.Fromversion0.12,theonlyvalidpositionalargumentwillbe`data`,andpassingot
herargumentswithoutanexplicitkeywordwillresultinanerrorormisinterpretation.
warnings.warn(
<AxesSubplot:xlabel='Neighborhood',ylabel='SalePrice'>
#herewehaveplottedthefigureofsize15,8thatisthelengthandthebreadthrespectively
#xticksrepresentsthenameofthedatapoints
#rotationstandsforatwhatanglewewanttoaligntheplot
#thenbyusingtheboxplotintheseabornlibraryforthetwovariablesinthedataframe
#andtheboxplotiscreated
plt.figure(figsize=(15,8))
plt.xticks(rotation=45)
sns.boxplot('Neighborhood','SalePrice',data=dataframe)
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\_decorators.py:36:FutureWarning:Passthefollowingvariab
lesaskeywordargs:x,y.Fromversion0.12,theonlyvalidpositionalargumentwillbe`data`,andpassingot
herargumentswithoutanexplicitkeywordwillresultinanerrorormisinterpretation.
warnings.warn(
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\categorical.py:1296:UserWarning:37.9%ofthepointscanno
tbeplaced;youmaywanttodecreasethesizeofthemarkersorusestripplot.
warnings.warn(msg,UserWarning)
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\categorical.py:1296:UserWarning:70.5%ofthepointscanno
tbeplaced;youmaywanttodecreasethesizeofthemarkersorusestripplot.
warnings.warn(msg,UserWarning)
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\categorical.py:1296:UserWarning:63.1%ofthepointscanno
tbeplaced;youmaywanttodecreasethesizeofthemarkersorusestripplot.
warnings.warn(msg,UserWarning)
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\categorical.py:1296:UserWarning:53.0%ofthepointscanno
tbeplaced;youmaywanttodecreasethesizeofthemarkersorusestripplot.
warnings.warn(msg,UserWarning)
C:\Users\Ankita\anaconda3\lib\site-packages\seaborn\categorical.py:1296:UserWarning:18.5%ofthepointscanno
tbeplaced;youmaywanttodecreasethesizeofthemarkersorusestripplot.
warnings.warn(msg,UserWarning)
<AxesSubplot:xlabel='OverallQual',ylabel='SalePrice'>
##herewehaveplottedthefigureofsize12,6thatisthelengthandthebreadthrespectively
#thenwehavecreatedtheswarmplotforthetwovariablesinthedatafrom
#thevariablecalleddataframe
#higherthevolumeofthedatapointsdenserwillbethegraph
plt.figure(figsize=(12,6))
sns.swarmplot('OverallQual','SalePrice',data=dataframe)
#crosstabisthenameofthegraph
#wehaveused2variablesthaisneighbourhoodandtheoverallqual
#firstlywehavecreatedthevariablecalledcrosstab
#thenbyusingthepandaslibrarywearecreatingthecrosstabfortheelement
#calledneighbourhoodinthedataframewhichwillbetheindexandhecolumn
#wearecreatingidfortheoverallquality
#thenwearecheckingtheresultsforthecrosstab
crosstab=pd.crosstab(index=dataframe["Neighborhood"],columns=dataframe["OverallQual"])
crosstab
#intheoutputisisshowingtherelationshipbetweenthenumericalandthecategoricalvalues
OverallQual 1 2 3 4 5 6 7 8 9 10
Neighborhood
Blmngtn 0 0 0 0 0 0 14 3 0 0
Blueste 0 0 0 0 0 2 0 0 0 0
BrDale 0 0 0 0 5 11 0 0 0 0
BrkSide 1 1 3 10 21 18 4 0 0 0
ClearCr 0 0 0 3 6 10 9 0 0 0
CollgCr 0 0 0 1 28 24 71 23 3 0
Crawfor 0 0 0 1 11 19 13 7 0 0
Edwards 1 0 5 25 38 23 5 0 1 2
Gilbert 0 0 0 1 3 33 36 5 1 0
IDOTRR 0 1 3 12 11 8 2 0 0 0
MeadowV 0 0 0 10 6 1 0 0 0 0
Mitchel 0 0 0 3 22 17 6 1 0 0
NAmes 0 0 3 20 113 74 12 3 0 0
NPkVill 0 0 0 0 0 9 0 0 0 0
NWAmes 0 0 0 0 8 36 26 3 0 0
NoRidge 0 0 0 0 0 0 12 24 1 4
NridgHt 0 0 0 0 0 2 16 27 24 8
OldTown 0 0 6 17 47 22 16 3 0 2
SWISU 0 1 0 2 9 10 3 0 0 0
Sawyer 0 0 0 9 54 11 0 0 0 0
SawyerW 0 0 0 2 8 23 21 5 0 0
Somerst 0 0 0 0 0 11 41 29 4 1
StoneBr 0 0 0 0 0 0 3 16 5 1
Timber 0 0 0 0 5 6 8 16 3 0
Veenker 0 0 0 0 2 4 1 3 1 0
<AxesSubplot:xlabel='Neighborhood'>
#herewearecreatingtheplotforcrosstabvariable
#thekindofplotwewanttocreateisbargraph
#thefiguresizegivenis12,8thatmeansthelengthandbreadthrespectively
#stackedmeansonevariableuponothervariable
#stacked=True-ifwewanttodesignthestackthengiveitastrueor
#ifyoudontwantthestackthengiveitasfalse
#thenwehavegiventhecolorofthemapaspaired
crosstab.plot(kind="bar",figsize=(12,8),stacked=True,colormap='Paired')
#herewehaveimportedtheseabornlibrary
#thenbyusingtheseabornlibrarywehavesetthestyleofthebackgroundaswhitegrid
#fisthevariablewearedefiningforax
#axistherepresentstheboundariesofthegraphandtodefinethat
#weareusingthesubplotandinthesubplotwearedefiningthefiguresizethat
#is15,10thatmeanslengthandbreadthrespectively
#axisrepresentstheaxisxandyaxis
importseabornassns
sns.set(style="whitegrid")
f,ax=plt.subplots(figsize=(15,10))
#Showeachdistributionwithbothviolinsandpoints
#herebyusingthesns.violinplotwehavepassedtheelementonthexaxisandtheyaxis
#inner-itistherepresentationofthedatapointsintheviolininterior
Text(0,0.5,'SalePrice')
#wehavedefinedtheboxvariableaswecandefinetheothervariableslikequartile
#point,stick
#palette-thisisusedtodefinethecolorformationorthecolorscheme
#cut-itisusedtodefinethedistanceinunitsofbandwidthsizeortoextentthedensitypass
#theextremedatapointsthatmeansthedensityorthedistancebetweenthedifferentdifferent
#datapoints
#linewidth-thisthewidthofthegreyline
#andaxisthematplotaxex
sns.violinplot(x="SaleCondition",y="SalePrice",data=dataframe,inner="box",palette="Set3",
cut=2,linewidth=3,
ax=ax)
#herewehaveusedthevariablecalleddespine
#despine-itisusedtoremovethespiness/abnormalcurvefromthegraphandused
#left=truemeanswearetryingtoremovetheleftspinessfromthedata
#thenwehavesetthetitleofthegraphattheboundaries
#thenwehavesetthexlabelattheboundarieandgiventhevalueforalphaas0.7
#similarlysettheylabelattheboundarieandgiventhevalueforalphaas0.7
#afterdefiningthetitlewehavedefinedthealphathatmeanswehavedefinedthevalue
#aplhaisakindofhuethatwearegivingthedifferentparametresasaplhaisakindof
#referencevariable
sns.despine(left=True)
ax.set_title('SaleConditionvs.SalesPrice')
ax.set_xlabel("SaleCondition",alpha=0.7)
ax.set_ylabel("SalePrice",alpha=0.7)
#herewehaveimportedtheseabornlibrary
#thenbyusingtheseabornlibrarywehavesetthestyleofthebackgroundaswhitegrid
#fisthevariablewearedefiningforax
#axistherepresentstheboundariesofthegraphandtodefinethat
#weareusingthesubplotandinthesubplotwearedefiningthefiguresizethatis15,10
#thatmeanslengthandbreadthrespectively
#axisrepresentstheaxisxandyaxis
importseabornassns
sns.set(style="whitegrid")
f,ax=plt.subplots(figsize=(15,10))
#Showeachdistributionwithbothviolinsandpoints
#herebyusingthesns.violinplotwehavepassedtheelementonthexaxisandtheyaxis
#inner-itistherepresentationofthedatapointsintheviolininterior
#wehavedefinedtheboxvariableaswecandefinetheothervariableslikequartile
#point,stick
#palette-thisisusedtodefinethecolorformationorthecolorscheme
#cut-itisusedtodefinethedistanceinunitsofbandwidthsizeortoextentthedensity
#passtheextremedatapointsthatmeansthedensityorthedistancebetweenthedifferent
Text(0,0.5,'SaleCondition')
#differentdatapoints
#linewidth-thisthewidthofthegreyline
#andaxisthematplotaxex
sns.violinplot(x="SalePrice",y="SaleCondition",data=dataframe,palette="Set3",inner="points",
bw=.2,cut=2,
linewidth=3,ax=ax)
#bw-itiscomputingthebandwidthofthekernel
#herewehaveusedthevariablecalleddespine
#despine-itisusedtoremovethespiness/abnormalcurvefromthegraphandused
#left=truemeanswearetryingtoremovetheleftspinessfromthedata
#thenwehavesetthetitleofthegraphattheboundaries
#thenwehavesetthexlabelattheboundarieandgiventhevalueforalphaas0.7
#similarlysettheylabelattheboundarieandgiventhevalueforalphaas0.7
#afterdefiningthetitlewehavedefinedthealphathatmeanswehavedefinedthevalue
#aplhaisakindofhuethatwearegivingthedifferentparametresas
#aplhaisakindofreferencevariable
sns.despine(left=True)
ax.set_title('SalesPricevs.SaleCondition')
ax.set_xlabel("SalesPrice",alpha=0.7)
ax.set_ylabel("SaleCondition",alpha=0.7)
#herewehaveimportedtheseabornlibrary
#thenbyusingtheseabornlibrarywehavesetthestyleofthebackgroundaswhitegrid
#fisthevariablewearedefiningforax
#axistherepresentstheboundariesofthegraphandtodefinethatweareusingthe
#subplotandinthesubplotwearedefiningthefiguresizethatis15,10that
#meanslengthandbreadthrespectively
#axisrepresentstheaxisxandyaxis
importseabornassns
sns.set(style="whitegrid")
f,ax=plt.subplots(figsize=(15,10))
#hereweareplottingthescatterplotbyusingtheseabornlibraryfromthedatawiththe
#previouslydefinedvariablecalleddataframe
#settingtheelementsforxandyaxis
#legendsisusedifwewanttogivethelengendslikethepropergraphformation
#thenwedefinethesize
#thenwedefinetheaxesthatmeanswearedefiningtheboundaries
#thenwearesettingthetitleforthexandyaxisattheboundaries
#thenwearesettingthexlabelandtheylabel
#andalpha=0.7meansitisakindofhuethatwearegivingthedifferentparametres
#asaplhaisakindofreferencevariable
sns.scatterplot(data=dataframe,x="SaleCondition",y="SalePrice",legend=False,
sizes=(20,2000),ax=ax)
Text(0,0.5,'SalePrice')
Text(0,0.5,'SalePrice')
# Title and axis labels for the scatter plot; alpha=0.7 makes the label
# text slightly transparent.
# FIX: the title text had lost its spaces.
ax.set_title('Sale Condition vs. Sales Price for Different Neighborhood')
ax.set_xlabel("SaleCondition", alpha=0.7)
ax.set_ylabel("SalePrice", alpha=0.7)
#herewehaveimportedtheseabornlibrary
#thenbyusingtheseabornlibrarywehavesetthestyleofthebackgroundaswhitegrid
#fisthevariablewearedefiningforax
#axistherepresentstheboundariesofthegraphand
#todefinethatweareusingthesubplotandinthesubplot
#wearedefiningthefiguresizethatis15,10thatmeanslengthandbreadthrespectively
#axisrepresentstheaxisxandyaxis
importseabornassns
sns.set(style="whitegrid")
f,ax=plt.subplots(figsize=(15,10))
##hereweareplottingthescatterplotbyusingtheseabornlibrary.
#fromthedatawiththepreviouslydefinedvariablecalleddataframe
#settingtheelementsforxandyaxis
#thesizeisneighbourhoodbecausethesizeofthebubbleisaccordingtothenumberofthe
#neighbourhooddatapoints
#largertheneighbourhooddatapointslargerthebubblesize
#hueisthebackgroundreference
#legendsisusedifwewanttogivethelengendslikethepropergraphformation
#thenwedefinethesize
#thenwedefinetheaxesthatmeanswearedefiningtheboundaries
#thenwearesettingthetitleforthexandyaxisattheboundaries
#thenwearesettingthexlabelandtheylabel
#andalpha=0.7meansitisakindofhuethatwearegivingthedifferentparametres
#asaplhaisakindofreferencevariable
sns.scatterplot(data=dataframe,x="SaleCondition",y="SalePrice",size="Neighborhood",
hue="Neighborhood",alpha=0.5,legend=False,sizes=(20,2000),ax=ax)
ax.set_title('SaleConditionvs.SalesPriceforDifferentNeighborhood')
ax.set_xlabel("SaleCondition",alpha=0.7)
ax.set_ylabel("SalePrice",alpha=0.7)
#scatterplotidfromthesnslibrary
item_rating criteria
0 1 PackagingCost
1 5 FoodQuality
2 2 IngredientsCost
3 4 MarketDemand
4 5 CustomerRating
#hereweareplottingthelibrarycalledplotlyandimportingthemodulecalledexpresscalledpx
#thenimportingthepandasaspdtocreatethedataframe
#hereexpresswillworksthewrapper
#wrapperisthemethodtoencapsulateallthecodesthatmeansgettingallthecodesinonechunk
#thenbyusingthepandaslibrarywehavecreatedthedataframe
#thatisconvertedintothedictinorythatisfooddata
#theitemratingsaregiveninthelistthatisfrom1to5andthecreteriaisalsogiven
#inthelistanditemratingsandthecreteriaisinthedictinary
#andcheckingtheheadofthedictianrycreatedthatisfooddata
importplotly.expressaspx
importpandasaspd
food_data=pd.DataFrame(dict(
item_rating=[1,5,2,4,5],
criteria=['PackagingCost','FoodQuality','IngredientsCost',
'MarketDemand','CustomerRating']))
food_data.head()
#forthisdatawewilldefinetheradarchart
#hereweareplottingthelibrarycalledplotlyandimportingthemodulecalledexpresscalledpx
#forafilledlineintheradarchartusepx.linepolarwithfigupdatetraces
#fordesigningtheradarchartwewillusethelinepolarandtheupdatetracesmethod
#herewehavecreatedthevariablecalledfigandusedthepx.linepolar
#forthedictionarynamedfooddataandselectedtheradiusasitemrating
#thetaistheangleorthecreteriaonwhichtheplotistobedefined
#thenwehavechoselineclose=truetodefinetheclosenessoftheselines
#wehaveusedfig.update_traces(fill='toself')becausethefigupdatetraceswilltakethe
#selfdefinedvariables
#fig.show()-wehaveusedthistoshowtheplot
C:\Users\Ankita\anaconda3\lib\site-packages\plotly\express\_core.py:271:FutureWarning:Theframe.appendmethod
isdeprecatedandwillberemovedfrompandasinafutureversion.Usepandas.concatinstead.
trace_data=trace_data.append(trace_data.iloc[0])
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean
concave
points_mean
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430
5rows×32columns
fig=px.line_polar(food_data,r='item_rating',theta='criteria',line_close=True)
fig.update_traces(fill='toself')
fig.show()
PackagingCost
FoodQuality
IngredientsCostMarketDemand
CustomerRating
0 1 2 3 4 5
#featureselection
#dimenssion-itmeansnumbersofvariables
#dimenssionalityreduction-itmeanstoreducethenumbeofvariables
#featureselection-itisamethodtoselectavariablessothat
#wecanbuildamodelwiththegoodpower
#PCAisthelinearcomponentthatyougotonceyouperformthepca
#pcahas2componentsthatiseigenvaluesandtheeigenvectors
#ithas2propertiesthatismagnitude-itisthevaluethatalignofthespecificlength
#direction-itrepresentsthelineisinwhichdirection
#magnitudeiscalledaseigenvaluesandthedirectioniscalledeigenvectors
#wegotmagnitudeofthelinewhenwewillapplyPCA
#importingtherequiredlibraries
#matplotlibisusedforthevisualization
importmatplotlib.pyplotasplt
importpandasaspd
importnumpyasnp
%matplotlibinline
#herewehavecreatedthevariablecalleddf
#byusingthepandaslibrarywearereadingthecsvfile
#thenwearecheckingtheinitialrowsandcolumnsofthedatasetdf
df=pd.read_csv('breast-cancer-data.csv')
df.head()
#checkingtheshapeofthedf
(569,32)
<class'pandas.core.frame.DataFrame'>
RangeIndex:569entries,0to568
Datacolumns(total32columns):
#ColumnNon-NullCountDtype
----------------------------
0id569non-nullint64
1diagnosis569non-nullobject
2radius_mean569non-nullfloat64
3texture_mean569non-nullfloat64
4perimeter_mean569non-nullfloat64
5area_mean569non-nullfloat64
6smoothness_mean569non-nullfloat64
7compactness_mean569non-nullfloat64
8concavity_mean569non-nullfloat64
9concavepoints_mean569non-nullfloat64
10symmetry_mean569non-nullfloat64
11fractal_dimension_mean569non-nullfloat64
12radius_se569non-nullfloat64
13texture_se569non-nullfloat64
14perimeter_se569non-nullfloat64
15area_se569non-nullfloat64
16smoothness_se569non-nullfloat64
17compactness_se569non-nullfloat64
18concavity_se569non-nullfloat64
19concavepoints_se569non-nullfloat64
20symmetry_se569non-nullfloat64
21fractal_dimension_se569non-nullfloat64
22radius_worst569non-nullfloat64
23texture_worst569non-nullfloat64
24perimeter_worst569non-nullfloat64
25area_worst569non-nullfloat64
26smoothness_worst569non-nullfloat64
27compactness_worst569non-nullfloat64
28concavity_worst569non-nullfloat64
29concavepoints_worst569non-nullfloat64
30symmetry_worst569non-nullfloat64
31fractal_dimension_worst569non-nullfloat64
dtypes:float64(30),int64(1),object(1)
memoryusage:142.4+KB
df.shape
# Inspect dtypes and non-null counts -- there should be no missing values.
df.info()
#definingthearrayasnp.array
#herewehavecreatedthevariablecalledfeaturenames
#byusingthenumpyarraywearecreatingallthefeaturesthatareavailable
feature_names=np.array(['meanradius''meantexture''meanperimeter''meanarea'
'meansmoothness''meancompactness''meanconcavity'
'meanconcavepoints''meansymmetry''meanfractaldimension'
'radiuserror''textureerror''perimetererror''areaerror'
'smoothnesserror''compactnesserror''concavityerror'
'concavepointserror''symmetryerror''fractaldimensionerror'
'worstradius''worsttexture''worstperimeter''worstarea'
'worstsmoothness''worstcompactness''worstconcavity'
'worstconcavepoints''worstsymmetry''worstfractaldimension'])
#hereweareimportingthemodulecalledlabelencoderfromthepreprocessinginthesklearnlibrary
fromsklearn.preprocessingimportLabelEncoder
#Encodelabeldiagnosis
#Convertingdiagnosistonumericalvariableindf
#hereinthediagnosiscolumninthedatasetwearemappingallthevaluesof
#Mto1andBto0intoourdata
#andwewantthis1and0asintthatisinteger
#1willbetakenbyMand0willbetakenbyB
#herewecannotusethelabelencoderasitwillcreatethe2columnsforit
#soherewearereplacingthevaluesofcategoricalintothenumericalsowearemappingthevalues
#astypeintmeansthatinpythontheconversionwilltakeplaceintothefloatsohere
#wehavementionedastypeasinteger
df['diagnosis']=df['diagnosis'].map({'M':1,'B':0}).astype(int)
#Installfactoranalyzer
#factorsarepossibletocreateornot-thisiscalledasthefactoranalysis
#!pipinstallfactor_analyzer
#herewehaveusedjupytersowehaveusedexlamatorymark!
Requirementalreadysatisfied:factor_analyzerinc:\users\ankita\anaconda3\lib\site-packages(0.4.1)
Requirementalreadysatisfied:numpyinc:\users\ankita\anaconda3\lib\site-packages(fromfactor_analyzer)(1.2
1.5)
Requirementalreadysatisfied:scikit-learninc:\users\ankita\anaconda3\lib\site-packages(fromfactor_analyze
r)(1.0.2)
Requirementalreadysatisfied:pandasinc:\users\ankita\anaconda3\lib\site-packages(fromfactor_analyzer)(1.
4.4)
Requirementalreadysatisfied:pre-commitinc:\users\ankita\anaconda3\lib\site-packages(fromfactor_analyzer)
(2.21.0)
Requirementalreadysatisfied:scipyinc:\users\ankita\anaconda3\lib\site-packages(fromfactor_analyzer)(1.9
.1)
Requirementalreadysatisfied:python-dateutil>=2.8.1inc:\users\ankita\anaconda3\lib\site-packages(frompand
as->factor_analyzer)(2.8.2)
Requirementalreadysatisfied:pytz>=2020.1inc:\users\ankita\anaconda3\lib\site-packages(frompandas->factor
_analyzer)(2022.1)
Requirementalreadysatisfied:nodeenv>=0.11.1inc:\users\ankita\anaconda3\lib\site-packages(frompre-commit-
>factor_analyzer)(1.7.0)
Requirementalreadysatisfied:identify>=1.0.0inc:\users\ankita\anaconda3\lib\site-packages(frompre-commit-
>factor_analyzer)(2.5.12)
Requirementalreadysatisfied:virtualenv>=20.10.0inc:\users\ankita\anaconda3\lib\site-packages(frompre-com
mit->factor_analyzer)(20.17.1)
Requirementalreadysatisfied:cfgv>=2.0.0inc:\users\ankita\anaconda3\lib\site-packages(frompre-commit->fac
tor_analyzer)(3.3.1)
Requirementalreadysatisfied:pyyaml>=5.1inc:\users\ankita\anaconda3\lib\site-packages(frompre-commit->fac
tor_analyzer)(6.0)
Requirementalreadysatisfied:joblib>=0.11inc:\users\ankita\anaconda3\lib\site-packages(fromscikit-learn->
factor_analyzer)(1.1.0)
Requirementalreadysatisfied:threadpoolctl>=2.0.0inc:\users\ankita\anaconda3\lib\site-packages(fromscikit
-learn->factor_analyzer)(2.2.0)
Requirementalreadysatisfied:setuptoolsinc:\users\ankita\anaconda3\lib\site-packages(fromnodeenv>=0.11.1-
>pre-commit->factor_analyzer)(63.4.1)
Requirementalreadysatisfied:six>=1.5inc:\users\ankita\anaconda3\lib\site-packages(frompython-dateutil>=2
.8.1->pandas->factor_analyzer)(1.16.0)
Requirementalreadysatisfied:distlib<1,>=0.3.6inc:\users\ankita\anaconda3\lib\site-packages(fromvirtualen
v>=20.10.0->pre-commit->factor_analyzer)(0.3.6)
Requirementalreadysatisfied:platformdirs<3,>=2.4inc:\users\ankita\anaconda3\lib\site-packages(fromvirtua
lenv>=20.10.0->pre-commit->factor_analyzer)(2.5.2)
Requirementalreadysatisfied:filelock<4,>=3.4.1inc:\users\ankita\anaconda3\lib\site-packages(fromvirtuale
nv>=20.10.0->pre-commit->factor_analyzer)(3.6.0)
(40196.161163008626,0.0)
C:\Users\Ankita\anaconda3\lib\site-packages\factor_analyzer\utils.py:244:UserWarning:Theinverseofthevaria
nce-covariancematrixwascalculatedusingtheMoore-Penrosegeneralizedmatrixinversion,duetoitsdetermina
ntbeingatorveryclosetozero.
warnings.warn(
0.26062906217560566
!pipinstallfactor_analyzer
#ifwereusingthecondapromptthenwewritepipinstallandthenameoflibrary
#andifyouareusingjupyterthenuse!pipinstallnameofthelibrary
#hereweareimportingtheFactorAnalyzerfromthefactor_analyzerlibrary
#thenweareimportingcalculate_bartlett_sphericityfromthefactor_analyzer.factor_analyzerlibrary
#thenwearecalculatingthebartlett_sphericitythatmeansfactorsarepossibletocreateornot
#thenweare2valuesthatischisquarevalueandthepsquarevalue
#chisquarevalue-itisthemethodtocalculatethecorrerationbetweenthecategoricalvariables
#psquarevalue-pvalueistheresultantvalue
#identitymatrixisthemetarixinwhichwehaveoneinthediagonalandrestofthevaluesare0
fromfactor_analyzerimportFactorAnalyzer
fromfactor_analyzer.factor_analyzerimportcalculate_bartlett_sphericity
#herewearecalculatingthebartlett_sphericityofthedatasetthatisdfby
#usingthechisquarevalueandthepvalue
#thenwearecheckingthechisquarevalueandthepvalue
chi_square_value,p_value=calculate_bartlett_sphericity(df)
chi_square_value,p_value
#hereweareimportingthemodulecalledcalculatekmofromthefactor_analyzer.factor_analyzer
#library
#kmodefineswheatherweareabletomakethefactoreornot
#whywehaveused2timesfactoranalyzer
#whywehaveusedkmoall
fromfactor_analyzer.factor_analyzerimportcalculate_kmo
kmo_all,kmo_model=calculate_kmo(df)
#herewehavecheckedthekmomodel
#0.26isthevaluewearegettingofkmo
kmo_model
#herewearecheckingtheinitial2rowsofthedata
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean
concave
points_mean
0 842302 1 17.99 10.38 122.8 1001.0 0.11840 0.27760 0.3001 0.14710
1 842517 1 20.57 17.77 132.9 1326.0 0.08474 0.07864 0.0869 0.07017
2rows×32columns
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean
id 1.000000 0.039769 0.074626 0.099770 0.073159 0.096893 -0.012968 0.000096
diagnosis 0.039769 1.000000 0.730029 0.415185 0.742636 0.708984 0.358560 0.596534
radius_mean 0.074626 0.730029 1.000000 0.323782 0.997855 0.987357 0.170581 0.506124
texture_mean 0.099770 0.415185 0.323782 1.000000 0.329533 0.321086 -0.023389 0.236702
perimeter_mean 0.073159 0.742636 0.997855 0.329533 1.000000 0.986507 0.207278 0.556936
area_mean 0.096893 0.708984 0.987357 0.321086 0.986507 1.000000 0.177028 0.498502
smoothness_mean -0.012968 0.358560 0.170581 -0.023389 0.207278 0.177028 1.000000 0.659123
compactness_mean 0.000096 0.596534 0.506124 0.236702 0.556936 0.498502 0.659123 1.000000
concavity_mean 0.050080 0.696360 0.676764 0.302418 0.716136 0.685983 0.521984 0.883121
concavepoints_mean 0.044158 0.776614 0.822529 0.293464 0.850977 0.823269 0.553695 0.831135
symmetry_mean -0.022114 0.330499 0.147741 0.071401 0.183027 0.151293 0.557775 0.602641
fractal_dimension_mean -0.052511 -0.012838 -0.311631 -0.076437 -0.261477 -0.283110 0.584792 0.565369
radius_se 0.143048 0.567134 0.679090 0.275869 0.691765 0.732562 0.301467 0.497473
texture_se -0.007526 -0.008303 -0.097317 0.386358 -0.086761 -0.066280 0.068406 0.046205
perimeter_se 0.137331 0.556141 0.674172 0.281673 0.693135 0.726628 0.296092 0.548905
area_se 0.177742 0.548236 0.735864 0.259845 0.744983 0.800086 0.246552 0.455653
smoothness_se 0.096781 -0.067016 -0.222600 0.006614 -0.202694 -0.166777 0.332375 0.135299
compactness_se 0.033961 0.292999 0.206000 0.191975 0.250744 0.212583 0.318943 0.738722
concavity_se 0.055239 0.253730 0.194204 0.143293 0.228082 0.207660 0.248396 0.570517
concavepoints_se 0.078768 0.408042 0.376169 0.163851 0.407217 0.372320 0.380676 0.642262
symmetry_se -0.017306 -0.006522 -0.104321 0.009127 -0.081629 -0.072497 0.200774 0.229977
fractal_dimension_se 0.025725 0.077972 -0.042641 0.054458 -0.005523 -0.019887 0.283607 0.507318
radius_worst 0.082405 0.776454 0.969539 0.352573 0.969476 0.962746 0.213120 0.535315
texture_worst 0.064720 0.456903 0.297008 0.912045 0.303038 0.287489 0.036072 0.248133
perimeter_worst 0.079986 0.782914 0.965137 0.358040 0.970387 0.959120 0.238853 0.590210
area_worst 0.107187 0.733825 0.941082 0.343546 0.941550 0.959213 0.206718 0.509604
smoothness_worst 0.010338 0.421465 0.119616 0.077503 0.150549 0.123523 0.805324 0.565541
compactness_worst -0.002968 0.590998 0.413463 0.277830 0.455774 0.390410 0.472468 0.865809
concavity_worst 0.023203 0.659610 0.526911 0.301025 0.563879 0.512606 0.434926 0.816275
concavepoints_worst 0.035174 0.793566 0.744214 0.295316 0.771241 0.722017 0.503053 0.815573
symmetry_worst -0.044224 0.416294 0.163953 0.105008 0.189115 0.143570 0.394309 0.510223
fractal_dimension_worst -0.029866 0.323872 0.007066 0.119205 0.051019 0.003738 0.499316 0.687382
# Preview the first two rows of the data.
df.head(2)
# Full correlation matrix, rendered with a 'coolwarm' background gradient
# so strong positive/negative correlations stand out visually.
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')
# Keep only the columns that showed the strongest correlations (plus the
# diagnosis label) for the factor analysis below.
df_corr = df[['radius_mean', 'perimeter_mean', 'area_mean', 'radius_worst', 'perimeter_worst',
              'area_worst', 'concavity_mean', 'concavepoints_mean', 'concavity_worst',
              'concavepoints_worst', 'diagnosis']]
#hereweareimportingthecalculate_kmomodulefromthefactor_analyzer.factor_analyzerlibrary
0.8260787423549154
FactorAnalyzer(rotation='varimax',rotation_kwargs={})
array([9.09178108e+00,1.15744917e+00,3.30050451e-01,1.75328755e-01,
1.20114615e-01,8.46813558e-02,2.04348144e-02,1.33778025e-02,
4.91643818e-03,1.55811996e-03,3.07397281e-04])
# Recompute the KMO suitability measure on the reduced frame df_corr;
# it comes out far higher (~0.83, see output) than on the raw frame.
from factor_analyzer.factor_analyzer import calculate_kmo

kmo_all, kmo_model = calculate_kmo(df_corr)
kmo_model
#herewehaveimportedtheFactorAnalyzermodulefromtheFactorAnalyzerlibrary
#thenwewillanalyzethefactors
#thenwehavecreatedcalledfamethodforfactoranalyzer
#thenwearepassingoneparameterthroughFactorAnalyzerthatisrotation
#wearepassingtherotationmethodasvarimax
#varimax-itmeansorthogonalrotation,hereorthogonalmeanswhatever
#thefactoranalyzerwillfindsthefactorsitwillrotateto90degressthatisorthogonal
#weareusingtherotationtoclarifytherelationshipbetweenthefactors
#andtoachivethisweusetheconceptofrotatio
#df_corrthisisthedataframethatwehavecreatedpreviouslyandapplyingthefactoranalyzer
#methodonthisdataframe
#fit-wearefeedingthedataintothefavariablesoweareusingfitmethod
#fitisbasicallyusedtofeedthedatainthealgorithm
#thenwehaveused11becausepreviouslywehavepassedthe11features
fromfactor_analyzerimportFactorAnalyzer
fa=FactorAnalyzer(rotation='varimax')
fa.fit(df_corr,11)
#herewearecalculatingtheeigenvalueforeachfactor
#thenwehaevcreatedthevariablecalledvforcalcultaingtheeigenvaluesthatisev
#thenwehaveusedgetmethodtogettheeigenvaluesfromthe
#fathatwasdesignedforfactoranalyzer
#thenwehavecheckedthevalueforevthatiseogenvalue
ev,v=fa.get_eigenvalues()
ev
#herewewillplotthescatterplotthatisplt.scatterisusedtodesign
#thescatterplotthenwehavegiventherangeof1thenwehavegiventhenameofthe
#dataframenamethatisdf_corrthenwehavegiventheshapeandspecifiedtheeigenvaluesasev
#thenwehaveplt.plottoplotthegraphthenwehavegiventherangeof1thenwehavegiven
#thenameofthedataframethatisdf_corrthenwehavegiventheshapeandspecified
#theeigenvaluesasev
#usingthescatterwearedefiningthegrislinesonthegraphand
#usingtheplotweareplottingthegraph
#weareusingtherange1becausetherangefunctiononlyworkswithn-1
#andwewillleftwithonevaluesotomentionthenvaluewehavetodefinethe+1foreigenvalue
#+1isusedtocalculatethevalueofnanditspecifiestheeigenvaluethatisev
#thenwehavegiventhetitleoftheplotthatisscreeplot
#thenwehavegiventhenameofxlabelandtheylabel
#thenwehaveplotthegridtogetthegridlines
#thenwehaveusedtheshowmethodtoshowthegraph
#factorsarethestatisticalvalueswearetryingtofindandwearesimply
#passingittothefactoranalyzerandafterpassingitwillgivethefinaloutput
plt.scatter(range(1,df_corr.shape[1]+1),ev)
plt.plot(range(1,df_corr.shape[1]+1),ev)
plt.title('ScreePlot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()
FactorAnalyzer(n_factors=2,rotation='varimax',rotation_kwargs={})
array([[0.09949821,-0.0175289],
[0.76990049,0.2623415],
[0.97577564,-0.05180551],
[0.37374525,0.08592885],
[0.97834606,0.0038596],
[0.97757835,-0.04299328],
[0.21011043,0.61739299],
[0.54025265,0.78340655],
[0.71664403,0.63252346],
[0.8480132,0.44148549],
[0.19103234,0.6146844],
[-0.26532101,0.86590234],
[0.7431245,0.18542383],
[-0.04891221,0.18458444],
[0.73886249,0.2275271],
[0.79163253,0.08879413],
[-0.18452396,0.38701832],
[0.22456327,0.78529353],
[0.2181755,0.66739594],
[0.3904135,0.59545103],
[-0.08217263,0.40098444],
[-0.03298699,0.73180421],
[0.98993514,-0.00563542],
[0.36256059,0.11401422],
[0.99041425,0.05315551],
[0.97481137,-0.01017728],
[0.18911152,0.5607166],
[0.46178052,0.6952383],
[0.5755513,0.64392355],
[0.78203771,0.47681393],
[0.20855172,0.49045164],
[0.06575214,0.79707143]])
#herewehavecreatedthevariavlecalledfa1forthefactoranalyzer
#therortationmethodusedisvariamx
#andwehaveusednfactors=2becauseintheabovegraphwehaveonly2factorsabovetheelbow
#thenwehaveusedthefitmethodtofitthenumberoffactorsinthedataframe
#inthenewelycreatedvariablecalledfa1
#dfisthedatasettheisperviosulycreatedandonwhichweareworking
#n_factorsmeansitisthenumbersoffactors
fa1=FactorAnalyzer(rotation="varimax",n_factors=2)
fa1.fit(df,2)
#herewearecheckingthefa1loadings
#loadingsisthefunctionthroughwhichwegetthevaluesofthefactors
fa1.loadings_
#CFAstandsforConfirmatoryFactorAnalysis.ThislineimportsCFAfunction
#hereweareimportingConfirmatoryFactorAnalyzer,ModelSpecificationParsermodules
#fromthefactoranalyzerlibrary
fromfactor_analyzerimport(ConfirmatoryFactorAnalyzer,ModelSpecificationParser)
fromfactor_analyzerimport(ConfirmatoryFactorAnalyzer,ModelSpecificationParser)
C:\Users\Ankita\anaconda3\lib\site-packages\factor_analyzer\confirmatory_factor_analyzer.py:753:UserWarning:
Theoptimizationroutinefailedtoconverge:ABNORMAL_TERMINATION_IN_LNSRCH
ConfirmatoryFactorAnalyzer(disp=False,n_obs=569,
specification=<factor_analyzer.confirmatory_factor_analyzer.ModelSpecificationobjec
tat0x00000169C295C9D0>)
array([[6.69483034e-01,0.00000000e+00],
[1.79500917e+02,0.00000000e+00],
[2.35439189e+03,0.00000000e+00],
[2.98426589e+01,0.00000000e+00],
[2.07085357e+02,0.00000000e+00],
[3.96886695e+03,0.00000000e+00],
[0.00000000e+00,4.26104697e+01],
[0.00000000e+00,-1.50190334e+01],
[0.00000000e+00,8.46438659e+01],
[0.00000000e+00,6.53476942e+01],
[0.00000000e+00,-2.79595597e+00]])
array([[1.,0.06061904],
[0.06061904,1.]])
array([[0.26675751,-0.31745394],
[0.27272276,0.13956624],
[0.21225096,-0.06370616],
...,
[0.06518867,0.03192635],
[0.24091269,-0.47575006],
[-0.16168669,0.04700808]])
#thenwehavecreatedthevariablecalledmodeldict
#inmodel_dictwehavecreatedthelistinthedictinary
#thenwearehavingthe2keysthatiscolumnf1andcolumnf2
#f1hasallthevaluesandthef2columnhaveallthesevariables
model_dict={"F1":['radius_mean','perimeter_mean','area_mean','radius_worst',
'perimeter_worst','diagnosis'],"F2":['area_worst','concavity_mean',
'concavepoints_mean','concavity_worst','concavepoints_worst']}
#herewehavecreatedthevariblecalledmodelspecifications
#thenthemodelspecificationwearedefiningastheModelSpecificationParser
#thenparsermodelspecificationweareimportingthespecificationfromdictnary
#andpassingthedf_corrpreviosulycreatedvariableandthemodeldictonary
#wehavecreatedfror2factors
model_spec=ModelSpecificationParser.parse_model_specification_from_dict(df_corr,model_dict)
#Performsconfirmatoryfactoranalysis
#herewehavecreatedthemethodcalledcfa1forConfirmatoryFactorAnalyzer
#andpassingthemodelspecificationtothatfactoranalyzer
#disp=False-thismeanswedontwanttoshowthedisplacemenet
#sowearepassingdisplacement=false
#andalsowedontwantthestatisticalsummarysowehavegivendisp=false
#model_spec-thisisthevariablethatwearepasisngthoughthecfa1
#thenweusedthefitmethodtogetthevaluesofthepreviosulydefinedvariablethat
#isdf_corrintothecfa1
cfa1=ConfirmatoryFactorAnalyzer(model_spec,disp=False)
cfa1.fit(df_corr.values)
#cfa1.loadings_willgaveyouthefactorloadingmatrix
#Thefactorloadingisamatrixwhichshowstherelationship
#ofeachvariabletotheunderlyingfactor.
#Itshowsthecorrelationcoefficientforobservedvariableandfactor.
#Itshowsthevarianceexplainedbytheobservedvariable
#herewewillgetallthevaluesofthefactors
cfa1.loadings_
#Thiswillgiveyouthefactorcovariancematrixandthetypeofthiswillbenumpyarray
#herewearecalculatingthefactorvariablecovarianceandpassingitthroghthecfa1
#theoutputsarethefactorscovariances
cfa1.factor_varcovs_
#transform(X)usedtogetthefactorscoresfornewdataset.
#Parameters:X(array-like,shape(n_samples,n_features))
#–Thedatatoscoreusingthefittedfactormodel.
#Returns:scores–ThelatentvariablesofX.
#Returntype:numpyarray,shape(n_samples,n_components)
#herewearecalculatingthevaluesofthepreviosulydefinedvariable
#thatisdf_corrandtransformingitandpassingitthroughthecfa1
cfa1.transform(df_corr.values)
#importing the required libraries
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean
concave
points_mean
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430
5rows×32columns
(569,32)
<class'pandas.core.frame.DataFrame'>
RangeIndex:569entries,0to568
Datacolumns(total32columns):
#ColumnNon-NullCountDtype
----------------------------
0id569non-nullint64
1diagnosis569non-nullobject
2radius_mean569non-nullfloat64
3texture_mean569non-nullfloat64
4perimeter_mean569non-nullfloat64
5area_mean569non-nullfloat64
6smoothness_mean569non-nullfloat64
7compactness_mean569non-nullfloat64
8concavity_mean569non-nullfloat64
9concavepoints_mean569non-nullfloat64
10symmetry_mean569non-nullfloat64
11fractal_dimension_mean569non-nullfloat64
12radius_se569non-nullfloat64
13texture_se569non-nullfloat64
14perimeter_se569non-nullfloat64
15area_se569non-nullfloat64
16smoothness_se569non-nullfloat64
17compactness_se569non-nullfloat64
18concavity_se569non-nullfloat64
19concavepoints_se569non-nullfloat64
20symmetry_se569non-nullfloat64
21fractal_dimension_se569non-nullfloat64
22radius_worst569non-nullfloat64
23texture_worst569non-nullfloat64
24perimeter_worst569non-nullfloat64
25area_worst569non-nullfloat64
26smoothness_worst569non-nullfloat64
27compactness_worst569non-nullfloat64
28concavity_worst569non-nullfloat64
29concavepoints_worst569non-nullfloat64
30symmetry_worst569non-nullfloat64
31fractal_dimension_worst569non-nullfloat64
dtypes:float64(30),int64(1),object(1)
memoryusage:142.4+KB
#importingtherequiredlibraies
importmatplotlib.pyplotasplt
importpandasaspd
importnumpyasnp
importseabornassns
%matplotlibinline
#herewehavecreatedthevariablecalleddf
#thenbyusingthepandaslibrarywearereadingthecsvfile
#thenwearecheckingtheinitialrowsandcolumnsofthedataset
df=pd.read_csv('breast-cancer-data.csv')
df.head()
#checkingtheshapeofthedataset
df.shape
#checkingtheinformationpresentinthedataset
df.info()
#hereweahvecreatedthevariablcalledfeaturesnames
#byusingthenumpylibrarywehavecreatedanarrayunderthevariablecalledfeaturesnames
feature_names=np.array(['meanradius''meantexture''meanperimeter''meanarea'
'meansmoothness''meancompactness''meanconcavity'
'meanconcavepoints''meansymmetry''meanfractaldimension'
'radiuserror''textureerror''perimetererror''areaerror'
'smoothnesserror''compactnesserror''concavityerror'
StandardScaler()
PCA(n_components=2)
(569,31)
'concavepointserror''symmetryerror''fractaldimensionerror'
'worstradius''worsttexture''worstperimeter''worstarea'
'worstsmoothness''worstcompactness''worstconcavity'
'worstconcavepoints''worstsymmetry''worstfractaldimension'])
#hereweareimportingthelabelencodermoduleunderthepreprocessingfromthesklearnlibrary
fromsklearn.preprocessingimportLabelEncoder
#encodelabeldiagonsis
#M>1
#B?0
#inthedaatsetwearehavingonecategoricalcolumncalleddiagonsis
#thenwearecreatingthemethodcalledencoder
#toapplythelabelencoderonthecategoricalcolumnthatisdiagnosistoconvert
#thiscolumnintothenumericaldata
#aswehaveonlyonecolumntoencodethatisthetargetdatasowehaveusedencoder
#thenwehaveusedthefittransformedsothattheaftertheconversionofdiagnosiscolumn
#intothenumericaldataitwillgetfitintothetargerdata
target_data=df["diagnosis"]
encoder=LabelEncoder()
target_data=encoder.fit_transform(target_data)
#herewearedroppingthecolumncalleddisgnosisfromthe
#axis=1thatiscolumnandusedinplace=true
#wearedroppingthediagnosiscolumnbecausewearehavingthetargetdata
#thatishavingthetransformeddata
df.drop(["diagnosis"],axis=1,inplace=True)
#hereweareimportingtheStandardScalermoduleunderthepreprocessingfromthesklearnlibrary
#StandardScaler-itisusedtoscaledownthevaluetothesinglescale
#StandardScalerwillremovethemeanandscaledownthefeaturestounitvariance
#StandardScalerwillreducethemeanto0andvarianceto1
fromsklearn.preprocessingimportStandardScaler
#herewehavecreatedthemethodcalledscalertopasstheStandardScaler
#thenwearepassingtheentiredatathatisdfthroughfitmethodtoscaler
scaler=StandardScaler()
scaler.fit(df)
#herewehavecreatedthevariablecalledscaleddata
#thenwearetransformingtheentiredataofthescalerthatisdftoscaleddata
scaled_data=scaler.transform(df)
#hereweareimportingPCAmoduleunderthedecompositionfromthesklearnlibrary
#PCAconvertsthehighernumberofvariablestothelowernumberofvariables
#butthereshouldnotbeanylossofinformation.
fromsklearn.decompositionimportPCA
#herewehaveprovidedthen_components=2aswearehavingz1&z2
#thenwearepassingthen_componentsthriughPCAinpca
pca=PCA(n_components=2)
#herewearepasisngthescaled_datathrughfitmethodinpca
pca.fit(scaled_data)
#wecaneitherusefittransformmethodoreither
#wecanuseonebyonethatmeansfirstusethefitmethodandthenusethetransformmethod.
#x_pca-thisisthenewelycreatedvariable
#herewearetranformingthescaleddatathroughpcainx_pca
x_pca=pca.transform(scaled_data)
#herewearecheckingtheshapeofthescaleddata
scaled_data.shape
(569,2)
Text(0,0.5,'SecondPrincipalComponent')
array([[0.02291216,0.21891302,0.10384388,0.22753491,0.22104577,
0.14241471,0.2390673,0.25828025,0.26073811,0.13797774,
0.06414779,0.20611747,0.01741339,0.21144652,0.20307642,
0.01467821,0.1702884,0.15354367,0.18340675,0.04241552,
0.10249607,0.22800935,0.10451545,0.23663734,0.22493214,
0.12782441,0.20988456,0.22860218,0.2507462,0.12267993,
0.13156024],
[-0.03406849,-0.2332714,-0.0600442,-0.214589,-0.23066882,
0.18642221,0.15245473,0.06054163,-0.03416739,0.19068498,
0.36653106,-0.1059357,0.08954779,-0.08980704,-0.15277129,
0.20318988,0.23250336,0.19684608,0.12996518,0.18355863,
0.27958414,-0.21929604,-0.04550122,-0.19929599,-0.21898546,
0.17256296,0.14425364,0.09852652,-0.00753437,0.14261944,
0.27570208]])
array([0.42864701,0.18376792])
#herewearecheckingtheshapeofthex_pca
x_pca.shape
#PCAisthemethodtoreducethenumberofvariables
#byusingthePCAwearenotcreatinganymodel
#Reduced30dimensionstojust2!Let'splotthesetwodimensionsout!
#Drawinferencefromtheplot?
#hereweareplottingthefigureoffiguresizethatis9,6whichislengthandbreadthrespectively
#thenwearedoingtheslicingofthecolumnsofthepcathatiswearetakingthecolumn
#0and1fromthetargetdatathatisx_pca[:,0],x_pca[:,1]
#andthecolumnnameisthetargetdata
#andthecolormapwearetakingasvirdis
#thenwearesettingthexlabelandtheylabel
#finallyplottingthegraph
plt.figure(figsize=(9,6))
plt.scatter(x_pca[:,0],x_pca[:,1],c=target_data,cmap='viridis')
plt.xlabel('FirstPrincipalComponent')
plt.ylabel('SecondPrincipalComponent')
#herewearecheckingthecomponentsofthepca
#intheoutputwearegettingthevaluesofz1componentsandthevaluesofthez2components
pca.components_
#howmuchinformationz1andz2consisting.itiscalledastheexplainedvariance
#andcalculatingtheexplainedvariancetheoutputshowstheexplainedvarianceratio
pca.explained_variance_ratio_
(569,3)
array([0.42864701,0.18376792,0.09146436])
(559,32)
#thisisthe3principalcomponents
#herewearehavingthen_components=3whichwearepassingthroughPCAinthepca_3
#thenbyusingthefitmethodwearepassingthesacleddatathroughpca_3
#thenbyusingthetrasformedmethodwearepassingthescaleddatathroughpca_3inx_pca_3
pca_3=PCA(n_components=3)
pca_3.fit(scaled_data)
x_pca_3=pca_3.transform(scaled_data)
#herewearecheckingtheshapeofthex_pca_3
x_pca_3.shape
#herewearecalculatingtheexplainedvariancerationofthepca3
pca_3.explained_variance_ratio_
#descrimenantwecalculateforgettingthematrix
#firstwecalculatethetransposeofthematrixthenwewillcalculatethedescriminantofthematix
#LDAwillcalculatethelineardescrimenant
#lineardescrimenant-itmeanswhenwehaveallthedexcriminantlyingonthesingleplane
#usingtheLDAwecanusetheclassification
#measureofseperation:
#ifwehavelargedifferencebetweenthemeansthenwehavebetterseperability
#optimumprojection:itmeanshowthecomponentsareprojectingoneachother
#maximaandtheminimaarethetwofunctions-itisusedtoshowthelargestand
#lowestvaluesofthefunctionwithinarange
#ifwearehavingtherangeof0-1then0istheminimaand1isthemaxima
#maximaandminimaonlyworksfortherangeforadefinedfunction
#hereweareimportingtherequiredlibraries
#forbiuldingtheoptimalmodelweareusingKFold,cross_val_score,
#GridSearchCV,train_test_splitmodule
#whenwegetthedatawedivideitintothetraintestsplit
#gridsearchCV-CVstandsforcrossvalidation.gridsizewillcreatethesamplesofsamesizes.
#itwilltakethesamplesastrainandtestuntilthelastsampletraetedastestdata
#crossvalidationscore-itissimplythescoretogethowaccuratethemodelis
#kfold-itisthemethodofcalculatigtheaccuracyofthemodel
#accuracyscore-itwillgivesomenumericalvalue
#classificationreport-itwillgivetheentiresummaryoftheclassification
#confusionmatrix-itisthe2by2matrixthatis
#confusionmatrixistheimportantmatrixwhichisrequiredintheclassificationreport
importpandasaspd
importnumpyasnp
frommatplotlibimportpyplotasplt
fromsklearn.discriminant_analysisimportLinearDiscriminantAnalysis
fromsklearn.model_selectionimportKFold,cross_val_score,GridSearchCV,train_test_split
fromsklearn.metricsimportaccuracy_score,classification_report,confusion_matrix
importseabornassns
#herewehavecreatedthevariablecalleddf
#byusingthepandaslibrarywearereadingthecsvfile
df=pd.read_csv('breast-cancer-data.csv')
#herewearecheckingtheinitialrowsandcolumnsofthedataset
df=df.head(559)
#herewearecheckingtheshapeofthedataset
df.shape
#mapcategoricalvariable'diagnosis'intonumeric
#herewearemappingthediagnosiscolumnfromcategoricaltothenumerical
df.diagnosis=df.diagnosis.map({'M':1,'B':0})
#herewearecheckingtheinitialrowsandcolumnsofthedataset
df.head(5)
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean
points_mean
0 842302 1 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001
1 842517 1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869
2 84300903 1 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974
3 84348301 1 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414
4 84358402 1 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980
5rows×32columns
Index(['id','diagnosis','radius_mean','texture_mean','perimeter_mean',
'area_mean','smoothness_mean','compactness_mean','concavity_mean',
'concavepoints_mean','symmetry_mean','fractal_dimension_mean',
'radius_se','texture_se','perimeter_se','area_se','smoothness_se',
'compactness_se','concavity_se','concavepoints_se','symmetry_se',
'fractal_dimension_se','radius_worst','texture_worst',
'perimeter_worst','area_worst','smoothness_worst',
'compactness_worst','concavity_worst','concavepoints_worst',
'symmetry_worst','fractal_dimension_worst'],
dtype='object')
diagnosisFalse
radius_meanFalse
texture_meanFalse
perimeter_meanFalse
area_meanFalse
smoothness_meanFalse
compactness_meanFalse
concavity_meanFalse
concavepoints_meanFalse
symmetry_meanFalse
fractal_dimension_meanFalse
radius_seFalse
texture_seFalse
perimeter_seFalse
area_seFalse
smoothness_seFalse
compactness_seFalse
concavity_seFalse
concavepoints_seFalse
symmetry_seFalse
fractal_dimension_seFalse
radius_worstFalse
texture_worstFalse
perimeter_worstFalse
area_worstFalse
smoothness_worstFalse
compactness_worstFalse
concavity_worstFalse
concavepoints_worstFalse
symmetry_worstFalse
fractal_dimension_worstFalse
dtype:bool
#herewearecheckingthecolumnspresentinthedataset
df.columns
#herewearedroppingthecolumnidfromaxis=1thatiscolumnandusedinplace=true
df.drop('id',axis=1,inplace=True)
#herewearecheckingifanyofthenullvaluesarepresentinthedataset
df.isna().any()
#herewearedefiningthetrainandtestdataset
#wearedefiningtherowshere
#thenweareusingthetrainsizeof0.8means80%willbethe
#traindatasetandtestsizeis0.2means20%willbethetestdataset
#andwehavetherandomstateas120meansagainandagainifwewillrunthealgorithm
#itwillnotchangethevalues
#weareusing-1becausefromtheentirecolumnremovetheonecolumnthatis-1thatisdiagnosis
X_train,X_val,y_train,y_val=train_test_split(df.iloc[:,:-1],df['diagnosis'],
train_size=0.8,test_size=0.2,random_state=120)
#herewearenormalizingthedata
#normalizerdoesnotworkontheconceptofmeanandvariance
#hereweareimportingthenormalizierunderthepreprocessingfromthesklearnlibrary
#herewehavecreatedthemethodcallednormforpassingthenormalizier
#thenbyusingthefitmethodwearepassingtheX_trainthroughnormintheX_train_norm
LDAAccuracyis:1.0
LDAClassificationReport
precisionrecallf1-scoresupport
01.001.001.0076
11.001.001.0036
accuracy1.00112
macroavg1.001.001.00112
weightedavg1.001.001.00112
PredictedNegative PredictedPostive
ActualNegative 76 0
ActualPositive 0 36
#thenwearetransformingtheX_valpasisingthroughthenorminX_val_norm
fromsklearn.preprocessingimportNormalizer
norm=Normalizer()
norm.fit(X_train)
X_train_norm=norm.transform(X_train)
X_val_norm=norm.transform(X_val)
#herewehavecreatedthemethodcalledldaforpassingtheLinearDiscriminantAnalysis
#byusingthefitmethodwearepassingthenormalizedxtraindatasetandthe
#ytraindatasetinthelda
#thenbyusingthepredictmethodwearepredictingtheX_val_normandpassing
#itintheldapredicted
#thenweareprintingtheaccuracyofthelda
#accuracyscoreiscalculatedusingthe2componentsthatispredictedvalueand
#originalvalueoftheydatasetnotthexdataset
#thenwewillprinttheclassificationreportinwhichwewillgettheentiresummaryofthemodel
lda=LinearDiscriminantAnalysis()
lda.fit(X_train_norm,y_train)
lda_predicted=lda.predict(X_val_norm)
print('LDAAccuracyis:{}'.format(accuracy_score(y_val,lda_predicted)))
print('LDAClassificationReport')
print(classification_report(y_val,lda_predicted))
#herewearecalcuatingtheconfusionmatrix
#thenwearecreatingthedataframebyusingthepandaslibrary
#thenwearecheckingtheconfusionmatrixoflda
confusion_matrix_lda=pd.DataFrame(confusion_matrix(y_val,lda_predicted),
index=['ActualNegative','ActualPositive'],
columns=['PredictedNegative','PredictedPostive'])
confusion_matrix_lda
#intheoutputwearegetting0,0meansthereisnoerror